55 files changed, 1306 insertions, 236 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6af2cba10093..8c0f51145139 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -314,6 +314,7 @@ set(LLVM_CMAKE_PATH ${LLVM_MAIN_SRC_DIR}/cmake/modules)
 set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples)
 set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
 
+# List of all targets to be built by default:
 set(LLVM_ALL_TARGETS
   AArch64
   AMDGPU
@@ -325,7 +326,6 @@ set(LLVM_ALL_TARGETS
   MSP430
   NVPTX
   PowerPC
-  RISCV
   Sparc
   SystemZ
   X86
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 44efc1498060..5c65864e901e 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -5369,6 +5369,10 @@ The following behaviors are supported:
            nodes. However, duplicate entries in the second list are dropped
            during the append operation.
 
+   * - 7
+     - **Max**
+           Takes the max of the two values, which are required to be integers.
+
 It is an error for a particular unique flag ID to have multiple behaviors,
 except in the case of **Require** (which adds restrictions on another metadata
 value) or **Override**.
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index dcd2ec7eb22b..48af491f1214 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -117,6 +117,18 @@ Changes to the X86 Target
 
 * Added support for AMD Lightweight Profiling (LWP) instructions.
 
+* Avoid using slow LEA instructions.
+
+* Use alternative sequences for multiply by constant.
+
+* Improved lowering of strided shuffles.
+
+* Improved the AVX512 cost model used by the vectorizer.
+
+* Fix scalar code performance when AVX512 is enabled by making i1's illegal.
+
+* Fixed many inline assembly bugs.
+
 Changes to the AMDGPU Target
 -----------------------------
 
@@ -160,7 +172,29 @@ Changes to the C API
 External Open Source Projects Using LLVM 5
 ==========================================
 
-* A project...
+Zig Programming Language
+------------------------
+
+`Zig <http://ziglang.org>`_  is an open-source programming language designed
+for robustness, optimality, and clarity. It integrates closely with C and is
+intended to eventually take the place of C. It uses LLVM to produce highly
+optimized native code and to cross-compile for any target out of the box. Zig
+is in alpha; with a beta release expected in September.
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
+are underway.
 
 
 Additional Information
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 55a23c3cca9b..d6851f7143a5 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -1220,8 +1220,9 @@ public:
   /// If an existing load has uses of its chain, create a token factor node with
   /// that chain and the new memory node's chain and update users of the old
   /// chain to the token factor. This ensures that the new memory node will have
-  /// the same relative memory dependency position as the old load.
-  void makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
+  /// the same relative memory dependency position as the old load. Returns the
+  /// new merged load chain.
+  SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
 
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
diff --git a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
index 6c951fab6185..b7e462e85d9d 100644
--- a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
@@ -94,9 +94,9 @@ private:
       llvm_unreachable("Invalid emit-state.");
     }
 
-    void removeModuleFromBaseLayer(BaseLayerT &BaseLayer) {
-      if (EmitState != NotEmitted)
-        BaseLayer.removeModule(Handle);
+    Error removeModuleFromBaseLayer(BaseLayerT& BaseLayer) {
+      return EmitState != NotEmitted ? BaseLayer.removeModule(Handle)
+                                     : Error::success();
     }
 
     void emitAndFinalize(BaseLayerT &BaseLayer) {
@@ -226,9 +226,9 @@ public:
   ///   This method will free the memory associated with the given module, both
   /// in this layer, and the base layer.
   Error removeModule(ModuleHandleT H) {
-    (*H)->removeModuleFromBaseLayer(BaseLayer);
+    Error Err = (*H)->removeModuleFromBaseLayer(BaseLayer);
     ModuleList.erase(H);
-    return Error::success();
+    return Err;
   }
 
   /// @brief Search for the given named symbol.
diff --git a/include/llvm/Object/COFFImportFile.h b/include/llvm/Object/COFFImportFile.h
index 8e215b565fc4..cf9c80a06f49 100644
--- a/include/llvm/Object/COFFImportFile.h
+++ b/include/llvm/Object/COFFImportFile.h
@@ -73,6 +73,7 @@ private:
 struct COFFShortExport {
   std::string Name;
   std::string ExtName;
+  std::string SymbolName;
 
   uint16_t Ordinal = 0;
   bool Noname = false;
@@ -98,7 +99,8 @@ struct COFFShortExport {
 std::error_code writeImportLibrary(StringRef ImportName,
                                    StringRef Path,
                                    ArrayRef<COFFShortExport> Exports,
-                                   COFF::MachineTypes Machine);
+                                   COFF::MachineTypes Machine,
+                                   bool MakeWeakAliases);
 
 } // namespace object
 } // namespace llvm
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index b973203a89b6..9539fd7c7559 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -162,6 +162,11 @@ static cl::opt<unsigned>
                 cl::desc("Maximum depth of recursive SExt/ZExt"),
                 cl::init(8));
 
+static cl::opt<unsigned>
+    MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden,
+                  cl::desc("Max coefficients in AddRec during evolving"),
+                  cl::init(16));
+
 //===----------------------------------------------------------------------===//
 //                           SCEV class definitions
 //===----------------------------------------------------------------------===//
@@ -2878,6 +2883,12 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
       if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
         continue;
 
+      // Limit max number of arguments to avoid creation of unreasonably big
+      // SCEVAddRecs with very complex operands.
+      if (AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1 >
+          MaxAddRecSize)
+        continue;
+
       bool Overflow = false;
       Type *Ty = AddRec->getType();
       bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
@@ -7582,6 +7593,25 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
             const SCEV *BackedgeTakenCount = getBackedgeTakenCount(LI);
             if (const SCEVConstant *BTCC =
                   dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
+
+              // This trivial case can show up in some degenerate cases where
+              // the incoming IR has not yet been fully simplified.
+              if (BTCC->getValue()->isZero()) {
+                Value *InitValue = nullptr;
+                bool MultipleInitValues = false;
+                for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+                  if (!LI->contains(PN->getIncomingBlock(i))) {
+                    if (!InitValue)
+                      InitValue = PN->getIncomingValue(i);
+                    else if (InitValue != PN->getIncomingValue(i)) {
+                      MultipleInitValues = true;
+                      break;
+                    }
+                  }
+                  if (!MultipleInitValues && InitValue)
+                    return getSCEV(InitValue);
+                }
+              }
               // Okay, we know how many times the containing loop executes.  If
               // this is a constant evolving PHI node, get the final value at
               // the specified iteration number.
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 439b21a81258..cdfe74d158c9 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -4458,6 +4458,10 @@ Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
                                         unsigned Depth, AssumptionCache *AC,
                                         const Instruction *CxtI,
                                         const DominatorTree *DT) {
+  // Bail out when we hit the limit.
+  if (Depth == MaxDepth)
+    return None;
+
   // A mismatch occurs when we compare a scalar cmp to a vector cmp, for example.
   if (LHS->getType() != RHS->getType())
     return None;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 0cad20db0964..ecb54e1e4b41 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -302,7 +302,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
-  SDValue Cond = GetScalarizedVector(N->getOperand(0));
+  SDValue Cond = N->getOperand(0);
+  EVT OpVT = Cond.getValueType();
+  SDLoc DL(N);
+  // The vselect result and true/value operands needs scalarizing, but it's
+  // not a given that the Cond does. For instance, in AVX512 v1i1 is legal.
+  // See the similar logic in ScalarizeVecRes_VSETCC
+  if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+    Cond = GetScalarizedVector(Cond);
+  } else {
+    EVT VT = OpVT.getVectorElementType();
+    Cond = DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
+        DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+  }
+
   SDValue LHS = GetScalarizedVector(N->getOperand(1));
   TargetLowering::BooleanContent ScalarBool =
       TLI.getBooleanContents(false, false);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 823e77850c4b..0ff154784f68 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7262,22 +7262,23 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
     AddDbgValue(I, ToNode, false);
 }
 
-void SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
-                                                SDValue NewMemOp) {
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+                                                   SDValue NewMemOp) {
   assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
-  if (!OldLoad->hasAnyUseOfValue(1))
-    return;
-
   // The new memory operation must have the same position as the old load in
   // terms of memory dependency. Create a TokenFactor for the old load and new
   // memory operation and update uses of the old load's output chain to use that
   // TokenFactor.
   SDValue OldChain = SDValue(OldLoad, 1);
   SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
+  if (!OldLoad->hasAnyUseOfValue(1))
+    return NewChain;
+
   SDValue TokenFactor =
       getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
   ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
   UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+  return TokenFactor;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 124c2790f68c..f8aacdb8649d 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -180,6 +180,7 @@ class VirtRegRewriter : public MachineFunctionPass {
   void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
   void handleIdentityCopy(MachineInstr &MI) const;
   void expandCopyBundle(MachineInstr &MI) const;
+  bool subRegLiveThrough(const MachineInstr &MI, unsigned SuperPhysReg) const;
 
 public:
   static char ID;
@@ -415,6 +416,32 @@ void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
   }
 }
 
+/// Check whether (part of) \p SuperPhysReg is live through \p MI.
+/// \pre \p MI defines a subregister of a virtual register that
+/// has been assigned to \p SuperPhysReg.
+bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
+                                        unsigned SuperPhysReg) const {
+  SlotIndex MIIndex = LIS->getInstructionIndex(MI);
+  SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
+  SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
+  for (MCRegUnitIterator Unit(SuperPhysReg, TRI); Unit.isValid(); ++Unit) {
+    const LiveRange &UnitRange = LIS->getRegUnit(*Unit);
+    // If the regunit is live both before and after MI,
+    // we assume it is live through.
+    // Generally speaking, this is not true, because something like
+    // "RU = op RU" would match that description.
+    // However, we know that we are trying to assess whether
+    // a def of a virtual reg, vreg, is live at the same time of RU.
+    // If we are in the "RU = op RU" situation, that means that vreg
+    // is defined at the same time as RU (i.e., "vreg, RU = op RU").
+    // Thus, vreg and RU interferes and vreg cannot be assigned to
+    // SuperPhysReg. Therefore, this situation cannot happen.
+    if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses))
+      return true;
+  }
+  return false;
+}
+
 void VirtRegRewriter::rewrite() {
   bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
   SmallVector<unsigned, 8> SuperDeads;
@@ -452,7 +479,8 @@ void VirtRegRewriter::rewrite() {
             // A virtual register kill refers to the whole register, so we may
             // have to add <imp-use,kill> operands for the super-register.  A
             // partial redef always kills and redefines the super-register.
-            if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+            if ((MO.readsReg() && (MO.isDef() || MO.isKill())) ||
+                (MO.isDef() && subRegLiveThrough(*MI, PhysReg)))
               SuperKills.push_back(PhysReg);
 
             if (MO.isDef()) {
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 495e09fbae35..dd3235244e24 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -134,13 +134,13 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,
       uint64_t StringOffset =
           StrOffsetExt.getRelocatedValue(EntrySize, &Offset);
       if (Format == DWARF32) {
-        OS << format("%8.8x ", StringOffset);
         uint32_t StringOffset32 = (uint32_t)StringOffset;
+        OS << format("%8.8x ", StringOffset32);
         const char *S = StrData.getCStr(&StringOffset32);
         if (S)
           OS << format("\"%s\"", S);
       } else
-        OS << format("%16.16x ", StringOffset);
+        OS << format("%16.16" PRIx64 " ", StringOffset);
       OS << "\n";
     }
   }
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 6cf44ffa3796..4de46bea301e 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -196,7 +196,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
         ++NumErrors;
         OS << "error: DW_AT_stmt_list offset is beyond .debug_line "
               "bounds: "
-           << format("0x%08" PRIx32, *SectionOffset) << "\n";
+           << format("0x%08" PRIx64, *SectionOffset) << "\n";
         Die.dump(OS, 0);
         OS << "\n";
       }
@@ -234,7 +234,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
       if (CUOffset >= CUSize) {
         ++NumErrors;
         OS << "error: " << FormEncodingString(Form) << " CU offset "
-           << format("0x%08" PRIx32, CUOffset)
+           << format("0x%08" PRIx64, CUOffset)
            << " is invalid (must be less than CU size of "
            << format("0x%08" PRIx32, CUSize) << "):\n";
         Die.dump(OS, 0);
@@ -366,7 +366,7 @@ void DWARFVerifier::verifyDebugLineRows() {
       if (Row.Address < PrevAddress) {
         ++NumDebugLineErrors;
         OS << "error: .debug_line["
-           << format("0x%08" PRIx32,
+           << format("0x%08" PRIx64,
                      *toSectionOffset(Die.find(DW_AT_stmt_list)))
            << "] row[" << RowIndex
            << "] decreases in address from previous row:\n";
@@ -381,7 +381,7 @@ void DWARFVerifier::verifyDebugLineRows() {
       if (Row.File > MaxFileIndex) {
         ++NumDebugLineErrors;
         OS << "error: .debug_line["
-           << format("0x%08" PRIx32,
+           << format("0x%08" PRIx64,
                      *toSectionOffset(Die.find(DW_AT_stmt_list)))
            << "][" << RowIndex << "] has invalid file index " << Row.File
            << " (valid values are [1," << MaxFileIndex << "]):\n";
diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp
index a515bc8ad16d..ff039463d08c 100644
--- a/lib/Object/COFFImportFile.cpp
+++ b/lib/Object/COFFImportFile.cpp
@@ -557,7 +557,7 @@ NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
 
 std::error_code writeImportLibrary(StringRef ImportName, StringRef Path,
                                    ArrayRef<COFFShortExport> Exports,
-                                   MachineTypes Machine) {
+                                   MachineTypes Machine, bool MakeWeakAliases) {
 
   std::vector<NewArchiveMember> Members;
   ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine);
@@ -575,7 +575,7 @@ std::error_code writeImportLibrary(StringRef ImportName, StringRef Path,
     if (E.Private)
       continue;
 
-    if (E.isWeak()) {
+    if (E.isWeak() && MakeWeakAliases) {
       Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false));
       Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true));
       continue;
@@ -587,7 +587,7 @@ std::error_code writeImportLibrary(StringRef ImportName, StringRef Path,
     if (E.Constant)
       ImportType = IMPORT_CONST;
 
-    StringRef SymbolName = E.isWeak() ? E.ExtName : E.Name;
+    StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName;
     ImportNameType NameType = getNameType(SymbolName, E.Name, Machine);
     Expected<std::string> Name = E.ExtName.empty()
                                      ? SymbolName
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 005f2d51e403..9a7f45bde6c9 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -388,6 +388,10 @@ static unsigned isMatchingStore(MachineInstr &LoadInst,
 }
 
 static unsigned getPreIndexedOpcode(unsigned Opc) {
+  // FIXME: We don't currently support creating pre-indexed loads/stores when
+  // the load or store is the unscaled version.  If we decide to perform such an
+  // optimization in the future the cases for the unscaled loads/stores will
+  // need to be added here.
   switch (Opc) {
   default:
     llvm_unreachable("Opcode has no pre-indexed equivalent!");
@@ -451,32 +455,42 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
   default:
     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
   case AArch64::STRSui:
+  case AArch64::STURSi:
     return AArch64::STRSpost;
   case AArch64::STRDui:
+  case AArch64::STURDi:
     return AArch64::STRDpost;
   case AArch64::STRQui:
+  case AArch64::STURQi:
     return AArch64::STRQpost;
   case AArch64::STRBBui:
     return AArch64::STRBBpost;
   case AArch64::STRHHui:
     return AArch64::STRHHpost;
   case AArch64::STRWui:
+  case AArch64::STURWi:
     return AArch64::STRWpost;
   case AArch64::STRXui:
+  case AArch64::STURXi:
     return AArch64::STRXpost;
   case AArch64::LDRSui:
+  case AArch64::LDURSi:
     return AArch64::LDRSpost;
   case AArch64::LDRDui:
+  case AArch64::LDURDi:
     return AArch64::LDRDpost;
   case AArch64::LDRQui:
+  case AArch64::LDURQi:
     return AArch64::LDRQpost;
   case AArch64::LDRBBui:
     return AArch64::LDRBBpost;
   case AArch64::LDRHHui:
     return AArch64::LDRHHpost;
   case AArch64::LDRWui:
+  case AArch64::LDURWi:
     return AArch64::LDRWpost;
   case AArch64::LDRXui:
+  case AArch64::LDURXi:
     return AArch64::LDRXpost;
   case AArch64::LDRSWui:
     return AArch64::LDRSWpost;
@@ -1694,8 +1708,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
         ++NumPostFolded;
         break;
       }
-      // Don't know how to handle pre/post-index versions, so move to the next
-      // instruction.
+
+      // Don't know how to handle unscaled pre/post-index versions below, so
+      // move to the next instruction.
       if (TII->isUnscaledLdSt(Opc)) {
         ++MBBI;
         break;
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index ec49f0d37af4..46d8f0dba691 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -769,8 +769,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
   const MachineOperand &Dest = MI.getOperand(0);
-  unsigned StatusReg = MI.getOperand(1).getReg();
-  bool StatusDead = MI.getOperand(1).isDead();
+  unsigned TempReg = MI.getOperand(1).getReg();
   // Duplicating undef operands into 2 instructions does not guarantee the same
   // value on both; However undef should be replaced by xzr anyway.
   assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
@@ -797,23 +796,9 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
   }
 
   // .Lloadcmp:
-  //     mov wStatus, #0
   //     ldrex rDest, [rAddr]
   //     cmp rDest, rDesired
   //     bne .Ldone
-  if (!StatusDead) {
-    if (IsThumb) {
-      BuildMI(LoadCmpBB, DL, TII->get(ARM::tMOVi8), StatusReg)
-        .addDef(ARM::CPSR, RegState::Dead)
-        .addImm(0)
-        .add(predOps(ARMCC::AL));
-    } else {
-      BuildMI(LoadCmpBB, DL, TII->get(ARM::MOVi), StatusReg)
-        .addImm(0)
-        .add(predOps(ARMCC::AL))
-        .add(condCodeOp());
-    }
-  }
 
   MachineInstrBuilder MIB;
   MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
@@ -836,10 +821,10 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
   LoadCmpBB->addSuccessor(StoreBB);
 
   // .Lstore:
-  //     strex rStatus, rNew, [rAddr]
-  //     cmp rStatus, #0
+  //     strex rTempReg, rNew, [rAddr]
+  //     cmp rTempReg, #0
   //     bne .Lloadcmp
-  MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg)
+  MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), TempReg)
     .addReg(NewReg)
     .addReg(AddrReg);
   if (StrexOp == ARM::t2STREX)
@@ -848,7 +833,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
 
   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
   BuildMI(StoreBB, DL, TII->get(CMPri))
-      .addReg(StatusReg, getKillRegState(StatusDead))
+      .addReg(TempReg, RegState::Kill)
       .addImm(0)
       .add(predOps(ARMCC::AL));
   BuildMI(StoreBB, DL, TII->get(Bcc))
@@ -904,8 +889,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
   MachineOperand &Dest = MI.getOperand(0);
-  unsigned StatusReg = MI.getOperand(1).getReg();
-  bool StatusDead = MI.getOperand(1).isDead();
+  unsigned TempReg = MI.getOperand(1).getReg();
   // Duplicating undef operands into 2 instructions does not guarantee the same
   // value on both; However undef should be replaced by xzr anyway.
   assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
@@ -931,7 +915,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   // .Lloadcmp:
   //     ldrexd rDestLo, rDestHi, [rAddr]
   //     cmp rDestLo, rDesiredLo
-  //     sbcs rStatus<dead>, rDestHi, rDesiredHi
+  //     sbcs rTempReg<dead>, rDestHi, rDesiredHi
   //     bne .Ldone
   unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
   MachineInstrBuilder MIB;
@@ -959,17 +943,17 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   LoadCmpBB->addSuccessor(StoreBB);
 
   // .Lstore:
-  //     strexd rStatus, rNewLo, rNewHi, [rAddr]
-  //     cmp rStatus, #0
+  //     strexd rTempReg, rNewLo, rNewHi, [rAddr]
+  //     cmp rTempReg, #0
   //     bne .Lloadcmp
   unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
-  MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+  MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
   addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
   MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
 
   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
   BuildMI(StoreBB, DL, TII->get(CMPri))
-      .addReg(StatusReg, getKillRegState(StatusDead))
+      .addReg(TempReg, RegState::Kill)
       .addImm(0)
       .add(predOps(ARMCC::AL));
   BuildMI(StoreBB, DL, TII->get(Bcc))
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index d06b7d0896f1..7206083a7079 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -6053,21 +6053,21 @@ def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
 // significantly more naive than the standard expansion: we conservatively
 // assume seq_cst, strong cmpxchg and omit clrex on failure.
 
-let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
     mayLoad = 1, mayStore = 1 in {
-def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
                             (ins GPR:$addr, GPR:$desired, GPR:$new),
                             NoItinerary, []>, Sched<[]>;
 
-def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
                              (ins GPR:$addr, GPR:$desired, GPR:$new),
                              NoItinerary, []>, Sched<[]>;
 
-def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
                              (ins GPR:$addr, GPR:$desired, GPR:$new),
                              NoItinerary, []>, Sched<[]>;
 
-def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
                              (ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
                              NoItinerary, []>, Sched<[]>;
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7563bffd8f87..1e73122cdc38 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -419,6 +419,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::SETCC,  VT, Custom);
   }
+
+  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
+  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
+
   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
@@ -1383,7 +1388,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // (result) is 256-bit but the source is 512-bit wide.
     // 128-bit was made Custom under AVX1.
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
-                     MVT::v8f32, MVT::v4f64 })
+                     MVT::v8f32, MVT::v4f64, MVT::v1i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
                      MVT::v16i1, MVT::v32i1, MVT::v64i1 })
@@ -14570,6 +14575,21 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   MVT ResVT = Op.getSimpleValueType();
 
+  // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond
+  // would result with: v1i1 = extract_subvector(vXi1, idx).
+  // Lower these into extract_vector_elt which is already selectable.
+  if (ResVT == MVT::v1i1) {
+    assert(Subtarget.hasAVX512() &&
+           "Boolean EXTRACT_SUBVECTOR requires AVX512");
+
+    MVT EltVT = ResVT.getVectorElementType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT LegalVT =
+        (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
+    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
+  }
+
   assert((In.getSimpleValueType().is256BitVector() ||
           In.getSimpleValueType().is512BitVector()) &&
          "Can only extract from 256-bit or 512-bit vectors");
@@ -20651,8 +20671,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   }
   // ADC/ADCX/SBB
   case ADX: {
-    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
-    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                 DAG.getConstant(-1, dl, MVT::i8));
     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
@@ -30663,6 +30683,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
       return SDValue(N, 0);
   }
 
+  // Custom action for SELECT MMX
+  if (VT == MVT::x86mmx) {
+    LHS = DAG.getBitcast(MVT::i64, LHS);
+    RHS = DAG.getBitcast(MVT::i64, RHS);
+    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
+    return DAG.getBitcast(VT, newSelect);
+  }
+
   return SDValue();
 }
 
@@ -33358,7 +33386,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                   Ld->getPointerInfo(), Ld->getAlignment(),
                                   Ld->getMemOperand()->getFlags());
-      SDValue NewChain = NewLd.getValue(1);
+      // Make sure new load is placed in same chain order.
+      SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
       if (TokenFactorIndex >= 0) {
         Ops.push_back(NewChain);
         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
@@ -33379,11 +33408,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                                Ld->getPointerInfo().getWithOffset(4),
                                MinAlign(Ld->getAlignment(), 4),
                                Ld->getMemOperand()->getFlags());
+    // Make sure new loads are placed in same chain order.
+    SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
+    NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
 
-    SDValue NewChain = LoLd.getValue(1);
     if (TokenFactorIndex >= 0) {
-      Ops.push_back(LoLd);
-      Ops.push_back(HiLd);
+      Ops.push_back(NewChain);
       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
     }
 
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 705d0f7a5cf7..0e654a380e7c 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -978,6 +978,44 @@ multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
                          (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX;
 }
 
+multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, 
+                                    X86VectorVTInfo _, SDPatternOperator OpNode,
+                                    RegisterClass SrcRC, SubRegIndex Subreg> {
+  let ExeDomain = _.ExeDomain in
+  defm r : AVX512_maskable_custom<opc, MRMSrcReg,
+                        (outs _.RC:$dst), (ins GR32:$src),
+                        !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
+                        !con((ins _.KRCWM:$mask), (ins GR32:$src)),
+                        "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
+                        "$src0 = $dst">, T8PD, EVEX;
+
+  def : Pat <(_.VT (OpNode SrcRC:$src)),
+             (!cast<Instruction>(Name#r)
+              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+  def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0),
+             (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask,
+              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+  def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV),
+             (!cast<Instruction>(Name#rkz) _.KRCWM:$mask,
+              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+}
+
+multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name,
+                      AVX512VLVectorVTInfo _, SDPatternOperator OpNode,
+                      RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, _.info512, OpNode, SrcRC, 
+              Subreg>, EVEX_V512;
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, _.info256, OpNode,
+              SrcRC, Subreg>, EVEX_V256;
+    defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, _.info128, OpNode,
+              SrcRC, Subreg>, EVEX_V128;
+  }
+}
+
 multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
                                        SDPatternOperator OpNode,
                                        RegisterClass SrcRC, Predicate prd> {
@@ -989,18 +1027,11 @@ multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
   }
 }
 
-let isCodeGenOnly = 1 in {
-defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
-                                                 X86VBroadcast, GR8, HasBWI>;
-defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
-                                                 X86VBroadcast, GR16, HasBWI>;
-}
-let isAsmParserOnly = 1 in {
-  defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
-                                                       null_frag, GR32, HasBWI>;
-  defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
-                                                       null_frag, GR32, HasBWI>;
-}
+defm VPBROADCASTBr : avx512_int_broadcastbw_reg_vl<0x7A, "VPBROADCASTBr",
+                       avx512vl_i8_info, X86VBroadcast, GR8, sub_8bit, HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcastbw_reg_vl<0x7B, "VPBROADCASTWr",
+                       avx512vl_i16_info, X86VBroadcast, GR16, sub_16bit,
+                       HasBWI>;
 defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
                                                  X86VBroadcast, GR32, HasAVX512>;
 defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
diff --git a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index a7de79306074..fc15dc1e6032 100644
--- a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -60,11 +60,13 @@ std::vector<std::unique_ptr<MemoryBuffer>> OwningMBs;
 
 // Opens a file. Path has to be resolved already.
 // Newly created memory buffers are owned by this driver.
-MemoryBufferRef openFile(StringRef Path) {
+Optional<MemoryBufferRef> openFile(StringRef Path) {
   ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MB = MemoryBuffer::getFile(Path);
 
-  if (std::error_code EC = MB.getError())
+  if (std::error_code EC = MB.getError()) {
     llvm::errs() << "fail openFile: " << EC.message() << "\n";
+    return None;
+  }
 
   MemoryBufferRef MBRef = MB.get()->getMemBufferRef();
   OwningMBs.push_back(std::move(MB.get())); // take ownership
@@ -114,11 +116,16 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
   for (auto *Arg : Args.filtered(OPT_UNKNOWN))
     llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
 
-  MemoryBufferRef MB;
-  if (auto *Arg = Args.getLastArg(OPT_d))
-    MB = openFile(Arg->getValue());
+  if (!Args.hasArg(OPT_d)) {
+    llvm::errs() << "no definition file specified\n";
+    return 1;
+  }
+
+  Optional<MemoryBufferRef> MB = openFile(Args.getLastArg(OPT_d)->getValue());
+  if (!MB)
+    return 1;
 
-  if (!MB.getBufferSize()) {
+  if (!MB->getBufferSize()) {
     llvm::errs() << "definition file empty\n";
     return 1;
   }
@@ -133,7 +140,7 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
   }
 
   Expected<COFFModuleDefinition> Def =
-      parseCOFFModuleDefinition(MB, Machine, true);
+      parseCOFFModuleDefinition(*MB, Machine, true);
 
   if (!Def) {
     llvm::errs() << "error parsing definition\n"
@@ -154,7 +161,7 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
   if (Path.empty())
     Path = getImplibPath(Def->OutputFile);
 
-  if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine))
+  if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine, true))
     return 1;
   return 0;
 }
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index a33490f6e4ac..ddc975cbed1a 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1470,6 +1470,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
         }
 
         i = CS.arg_begin();
+        const unsigned ShadowArgStart = Args.size();
         for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
           Args.push_back(DFSF.getShadow(*i));
 
@@ -1505,6 +1506,15 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
         CustomCI->setCallingConv(CI->getCallingConv());
         CustomCI->setAttributes(CI->getAttributes());
 
+        // Update the parameter attributes of the custom call instruction to
+        // zero extend the shadow parameters. This is required for targets
+        // which consider ShadowTy an illegal type.
+        for (unsigned n = 0; n < FT->getNumParams(); n++) {
+          const unsigned ArgNo = ShadowArgStart + n;
+          if (CustomCI->getArgOperand(ArgNo)->getType() == DFSF.DFS.ShadowTy)
+            CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
+        }
+
         if (!FT->getReturnType()->isVoidTy()) {
           LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
           DFSF.setShadow(CustomCI, LabelLoad);
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 61e8700f1cd6..2e5618686ec2 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/DemandedBits.h"
@@ -35,6 +36,46 @@ using namespace llvm;
 STATISTIC(NumRemoved, "Number of instructions removed (unused)");
 STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
 
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+  assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+
+  // Initialize the worklist with eligible direct users.
+  SmallVector<Instruction *, 16> WorkList;
+  for (User *JU : I->users()) {
+    // If all bits of a user are demanded, then we know that nothing below that
+    // in the def-use chain needs to be changed.
+    auto *J = dyn_cast<Instruction>(JU);
+    if (J && !DB.getDemandedBits(J).isAllOnesValue())
+      WorkList.push_back(J);
+  }
+
+  // DFS through subsequent users while tracking visits to avoid cycles.
+  SmallPtrSet<Instruction *, 16> Visited;
+  while (!WorkList.empty()) {
+    Instruction *J = WorkList.pop_back_val();
+
+    // NSW, NUW, and exact are based on operands that might have changed.
+    J->dropPoisonGeneratingFlags();
+
+    // We do not have to worry about llvm.assume or range metadata:
+    // 1. llvm.assume demands its operand, so trivializing can't change it.
+    // 2. range metadata only applies to memory accesses which demand all bits.
+
+    Visited.insert(J);
+
+    for (User *KU : J->users()) {
+      // If all bits of a user are demanded, then we know that nothing below
+      // that in the def-use chain needs to be changed.
+      auto *K = dyn_cast<Instruction>(KU);
+      if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue())
+        WorkList.push_back(K);
+    }
+  }
+}
+
 static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
   SmallVector<Instruction*, 128> Worklist;
   bool Changed = false;
@@ -51,6 +92,9 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
       // replacing all uses with something else. Then, if they don't need to
       // remain live (because they have side effects, etc.) we can remove them.
       DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+
+      clearAssumptionsOfUsers(&I, DB);
+
       // FIXME: In theory we could substitute undef here instead of zero.
       // This should be reconsidered once we settle on the semantics of
       // undef, poison, etc.
diff --git a/test/Analysis/ScalarEvolution/max-addrec-size.ll b/test/Analysis/ScalarEvolution/max-addrec-size.ll
new file mode 100644
index 000000000000..aad0ddda37bc
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/max-addrec-size.ll
@@ -0,0 +1,33 @@
+; RUN: opt -analyze -scalar-evolution -scalar-evolution-max-add-rec-size=3 < %s | FileCheck %s
+
+; Show that we are able to avoid creation of huge SCEVs by capping the max
+; AddRec size.
+define i32 @test_01(i32 %a, i32 %b) {
+
+; CHECK-LABEL: Classifying expressions for: @test_01
+; CHECK-NEXT:    %iv = phi i32 [ %a, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    -->  {%a,+,%b}<%loop> U: full-set S: full-set
+; CHECK-NEXT:    %iv.next = add i32 %iv, %b
+; CHECK-NEXT:    -->  {(%a + %b),+,%b}<%loop> U: full-set S: full-set
+; CHECK-NEXT:    %x1 = mul i32 %iv, %iv.next
+; CHECK-NEXT:    -->  {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> U: full-set S: full-set
+; CHECK-NEXT:    %x2 = mul i32 %x1, %x1
+; CHECK-NEXT:    -->  ({((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> * {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop>) U: full-set S: full-set
+; CHECK-NEXT:    %x3 = mul i32 %x2, %x1
+; CHECK-NEXT:    -->  ({((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> * {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> * {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop>) U: full-set S: full-set
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %a, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, %b
+  %cond = icmp slt i32 %iv.next, 1000
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  %x1 = mul i32 %iv, %iv.next
+  %x2 = mul i32 %x1, %x1
+  %x3 = mul i32 %x2, %x1
+  ret i32 %x3
+}
diff --git a/test/CodeGen/AArch64/arm64-ldst-unscaled-pre-post.mir b/test/CodeGen/AArch64/arm64-ldst-unscaled-pre-post.mir
new file mode 100644
index 000000000000..dacaf4966d07
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldst-unscaled-pre-post.mir
@@ -0,0 +1,115 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass aarch64-ldst-opt  -verify-machineinstrs  -o - %s | FileCheck %s
+---
+# CHECK-LABEL: name: test_LDURSi_post
+# CHECK: LDRSpost %x0, -4
+name: test_LDURSi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %s0 = LDURSi %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURDi_post
+# CHECK: LDRDpost %x0, -4
+name: test_LDURDi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %d0 = LDURDi %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURQi_post
+# CHECK: LDRQpost %x0, -4
+name: test_LDURQi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %q0 = LDURQi  %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURWi_post
+# CHECK: LDRWpost %x0, -4
+name: test_LDURWi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %w1 = LDURWi %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURXi_post
+# CHECK: %x1 = LDRXpost %x0, -4
+name: test_LDURXi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %x1 = LDURXi %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURSi_post
+# CHECK: STRSpost %s0, %x0, -4
+name: test_STURSi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %s0 = FMOVS0
+    STURSi %s0, %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURDi_post
+# CHECK: STRDpost %d0, %x0, -4
+name: test_STURDi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %d0 = FMOVD0
+    STURDi %d0, %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURQi_post
+# CHECK: STRQpost %q0, %x0, -4
+name: test_STURQi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    %q0 = MOVIv4i32 0, 0
+    STURQi %q0, %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURWi_post
+# CHECK: STRWpost %wzr, %x0, -4
+name: test_STURWi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    STURWi %wzr, %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURXi_post
+# CHECK: STRXpost %xzr, %x0, -4
+name: test_STURXi_post
+body: |
+  bb.0.entry:
+    liveins: %x0
+
+    STURXi %xzr, %x0, 0
+    %x0 = SUBXri %x0, 4, 0
+    RET_ReallyLR implicit %x0
+...
diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll
index a3be72112c76..f8ad2bbbbe0e 100644
--- a/test/CodeGen/ARM/cmpxchg-O0.ll
+++ b/test/CodeGen/ARM/cmpxchg-O0.ll
@@ -10,11 +10,10 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
 ; CHECK:     dmb ish
 ; CHECK:     uxtb [[DESIRED:r[0-9]+]], [[DESIRED]]
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
-; CHECK:     mov{{s?}} [[STATUS:r[0-9]+]], #0
 ; CHECK:     ldrexb [[OLD:r[0-9]+]], [r0]
 ; CHECK:     cmp [[OLD]], [[DESIRED]]
 ; CHECK:     bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK:     strexb [[STATUS]], r2, [r0]
+; CHECK:     strexb [[STATUS:r[0-9]+]], r2, [r0]
 ; CHECK:     cmp{{(\.w)?}} [[STATUS]], #0
 ; CHECK:     bne [[RETRY]]
 ; CHECK: [[DONE]]:
@@ -30,11 +29,10 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind
 ; CHECK:     dmb ish
 ; CHECK:     uxth [[DESIRED:r[0-9]+]], [[DESIRED]]
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
-; CHECK:     mov{{s?}} [[STATUS:r[0-9]+]], #0
 ; CHECK:     ldrexh [[OLD:r[0-9]+]], [r0]
 ; CHECK:     cmp [[OLD]], [[DESIRED]]
 ; CHECK:     bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK:     strexh [[STATUS]], r2, [r0]
+; CHECK:     strexh [[STATUS:r[0-9]+]], r2, [r0]
 ; CHECK:     cmp{{(\.w)?}} [[STATUS]], #0
 ; CHECK:     bne [[RETRY]]
 ; CHECK: [[DONE]]:
@@ -50,11 +48,10 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind
 ; CHECK:     dmb ish
 ; CHECK-NOT:     uxt
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
-; CHECK:     mov{{s?}} [[STATUS:r[0-9]+]], #0
 ; CHECK:     ldrex [[OLD:r[0-9]+]], [r0]
 ; CHECK:     cmp [[OLD]], [[DESIRED]]
 ; CHECK:     bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK:     strex [[STATUS]], r2, [r0]
+; CHECK:     strex [[STATUS:r[0-9]+]], r2, [r0]
 ; CHECK:     cmp{{(\.w)?}} [[STATUS]], #0
 ; CHECK:     bne [[RETRY]]
 ; CHECK: [[DONE]]:
diff --git a/test/CodeGen/ARM/virtregrewriter-subregliveness.mir b/test/CodeGen/ARM/virtregrewriter-subregliveness.mir
new file mode 100644
index 000000000000..83335a3ccffd
--- /dev/null
+++ b/test/CodeGen/ARM/virtregrewriter-subregliveness.mir
@@ -0,0 +1,84 @@
+# RUN: llc  -o -  -mtriple=thumbv7--windows-gnu -run-pass=greedy -run-pass=virtregrewriter %s | FileCheck %s
+--- |
+  target datalayout = "e-m:w-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv7--windows-gnu"
+  
+  define void @subregLiveThrough() { ret void }
+  define void @subregNotLiveThrough() { ret void }
+  define void @subregNotLiveThrough2() { ret void }
+
+...
+---
+# Check that we properly recognize that r1 is live through
+# the first subreg copy.
+# That will materialize as an implicit use of the big register
+# on that copy.
+# PR34107.
+#
+# CHECK-LABEL: name: subregLiveThrough
+name:            subregLiveThrough
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprpair }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    ; That copy is being coalesced so we should use a KILL
+    ; placeholder. If that's not a kill that means we probably
+    ; not coalescing %0 and %r0_r1 and thus we are not testing
+    ; the problematic code anymore.
+    ;
+    ; CHECK: %r0 = KILL %r0, implicit killed %r0_r1, implicit-def %r0_r1
+    ; CHECK-NEXT: %r1 = KILL %r1, implicit killed %r0_r1
+    undef %0.gsub_0 = COPY %r0
+    %0.gsub_1 = COPY %r1
+    tBX_RET 14, _, implicit %0
+  
+
+...
+
+---
+# Check that we properly recognize that r1 is *not* live through
+# the first subreg copy.
+# CHECK-LABEL: name: subregNotLiveThrough
+name:            subregNotLiveThrough
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprpair }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    ; r1 is not live through so check we are not implicitly using
+    ; the big register.
+    ; CHECK: %r0 = KILL %r0, implicit-def %r0_r1
+    ; CHECK-NEXT: tBX_RET
+    undef %0.gsub_0 = COPY %r0
+    tBX_RET 14, _, implicit %0
+  
+
+...
+
+---
+# Check that we properly recognize that r1 is *not* live through
+# the first subreg copy. It is defined by this copy, but is not
+# through.
+# CHECK-LABEL: name: subregNotLiveThrough2
+name:            subregNotLiveThrough2
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprpair }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    ; r1 is not live through so check we are not implicitly using
+    ; the big register.
+    ; CHECK: %r0 = KILL %r0, implicit-def %r1, implicit-def %r0_r1
+    ; CHECK-NEXT: tBX_RET
+    undef %0.gsub_0 = COPY %r0, implicit-def %r1
+    tBX_RET 14, _, implicit %0
+  
+
+...
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
index 0498177a9c12..819a5df14e63 100644
--- a/test/CodeGen/X86/adx-intrinsics.ll
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -75,3 +75,30 @@ define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
   ret i8 %ret;
 }
 
+; Try a version with loads. Previously we crashed on this.
+define i32 @load_crash(i64* nocapture readonly %a, i64* nocapture readonly %b, i64* %res)  {
+; CHECK-LABEL: load_crash
+; CHECK: addb
+; ADX: adcxq
+; CHECK: setb
+; CHECK: retq
+  %1 = load i64, i64* %a, align 8
+  %2 = load i64, i64* %b, align 8
+  %3 = bitcast i64* %res to i8*
+  %4 = tail call i8 @llvm.x86.addcarryx.u64(i8 0, i64 %1, i64 %2, i8* %3)
+  %conv = zext i8 %4 to i32
+  ret i32 %conv
+}
+
+; Try a really simple all zero input case, which also used to crash
+define void @allzeros() {
+; CHECK-LABEL: allzeros
+; CHECK: xorl
+; CHECK: addb
+; CHECK: sbbq
+; CHECK: andl
+; CHECK: retq
+entry:
+  %0 = tail call i8 @llvm.x86.addcarryx.u64(i8 0, i64 0, i64 0, i8* null)
+  ret void
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 5472f057ef27..4abe3df9fc2a 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1921,9 +1921,9 @@ define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8>
 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
 ; AVX512BW:       ## BB#0:
 ; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpbroadcastb %dil, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpbroadcastb %dil, %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vpbroadcastb %dil, %zmm2
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm2
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -1934,9 +1934,9 @@ define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8>
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT:    vpbroadcastb %al, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT:    vpbroadcastb %al, %zmm0 {%k1}
-; AVX512F-32-NEXT:    vpbroadcastb %al, %zmm2
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm2
 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
@@ -1954,20 +1954,20 @@ define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i
 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
 ; AVX512BW:       ## BB#0:
 ; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpbroadcastw %di, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpbroadcastw %di, %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vpbroadcastw %di, %zmm2
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm2
 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpbroadcastw %ax, %zmm0 {%k1}
-; AVX512F-32-NEXT:    vpbroadcastw %ax, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT:    vpbroadcastw %ax, %zmm2
+; AVX512F-32-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm2
 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index c3ba6f106e6a..9ceb3e5931a6 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -2799,9 +2799,9 @@ define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastb %dil, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
-; CHECK-NEXT:    vpbroadcastb %dil, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
-; CHECK-NEXT:    vpbroadcastb %dil, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
 ; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
 ; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2819,9 +2819,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastb %dil, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
-; CHECK-NEXT:    vpbroadcastb %dil, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
-; CHECK-NEXT:    vpbroadcastb %dil, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
+; CHECK-NEXT:    vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT:    vpbroadcastb %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2839,9 +2839,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastw %di, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
-; CHECK-NEXT:    vpbroadcastw %di, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
-; CHECK-NEXT:    vpbroadcastw %di, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2859,9 +2859,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastw %di, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
-; CHECK-NEXT:    vpbroadcastw %di, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
-; CHECK-NEXT:    vpbroadcastw %di, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
+; CHECK-NEXT:    vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT:    vpbroadcastw %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/pr33349.ll b/test/CodeGen/X86/pr33349.ll
new file mode 100644
index 000000000000..db866db22481
--- /dev/null
+++ b/test/CodeGen/X86/pr33349.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mattr=+avx512f | FileCheck %s --check-prefix=KNL
+; RUN: llc < %s -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=SKX
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ define void @test(<4 x i1> %m, <4 x x86_fp80> %v, <4 x x86_fp80>*%p) local_unnamed_addr {
+; KNL-LABEL: test:
+; KNL:       # BB#0: # %bb
+; KNL-NEXT:    vpextrb $0, %xmm0, %eax
+; KNL-NEXT:    testb $1, %al
+; KNL-NEXT:    fld1
+; KNL-NEXT:    fldz
+; KNL-NEXT:    fld %st(0)
+; KNL-NEXT:    fcmovne %st(2), %st(0)
+; KNL-NEXT:    vpextrb $4, %xmm0, %eax
+; KNL-NEXT:    testb $1, %al
+; KNL-NEXT:    fld %st(1)
+; KNL-NEXT:    fcmovne %st(3), %st(0)
+; KNL-NEXT:    vpextrb $8, %xmm0, %eax
+; KNL-NEXT:    testb $1, %al
+; KNL-NEXT:    fld %st(2)
+; KNL-NEXT:    fcmovne %st(4), %st(0)
+; KNL-NEXT:    vpextrb $12, %xmm0, %eax
+; KNL-NEXT:    testb $1, %al
+; KNL-NEXT:    fxch %st(3)
+; KNL-NEXT:    fcmovne %st(4), %st(0)
+; KNL-NEXT:    fstp %st(4)
+; KNL-NEXT:    fxch %st(3)
+; KNL-NEXT:    fstpt 30(%rdi)
+; KNL-NEXT:    fxch %st(1)
+; KNL-NEXT:    fstpt 20(%rdi)
+; KNL-NEXT:    fxch %st(1)
+; KNL-NEXT:    fstpt 10(%rdi)
+; KNL-NEXT:    fstpt (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test:
+; SKX:       # BB#0: # %bb
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT:    kshiftrw $2, %k0, %k1
+; SKX-NEXT:    kshiftlw $15, %k1, %k2
+; SKX-NEXT:    kshiftrw $15, %k2, %k2
+; SKX-NEXT:    kshiftlw $15, %k2, %k2
+; SKX-NEXT:    kshiftrw $15, %k2, %k2
+; SKX-NEXT:    kmovd %k2, %eax
+; SKX-NEXT:    testb $1, %al
+; SKX-NEXT:    fld1
+; SKX-NEXT:    fldz
+; SKX-NEXT:    fld %st(0)
+; SKX-NEXT:    fcmovne %st(2), %st(0)
+; SKX-NEXT:    kshiftlw $14, %k1, %k1
+; SKX-NEXT:    kshiftrw $15, %k1, %k1
+; SKX-NEXT:    kshiftlw $15, %k1, %k1
+; SKX-NEXT:    kshiftrw $15, %k1, %k1
+; SKX-NEXT:    kmovd %k1, %eax
+; SKX-NEXT:    testb $1, %al
+; SKX-NEXT:    fld %st(1)
+; SKX-NEXT:    fcmovne %st(3), %st(0)
+; SKX-NEXT:    kshiftlw $15, %k0, %k1
+; SKX-NEXT:    kshiftrw $15, %k1, %k1
+; SKX-NEXT:    kshiftlw $15, %k1, %k1
+; SKX-NEXT:    kshiftrw $15, %k1, %k1
+; SKX-NEXT:    kmovd %k1, %eax
+; SKX-NEXT:    testb $1, %al
+; SKX-NEXT:    fld %st(2)
+; SKX-NEXT:    fcmovne %st(4), %st(0)
+; SKX-NEXT:    kshiftlw $14, %k0, %k0
+; SKX-NEXT:    kshiftrw $15, %k0, %k0
+; SKX-NEXT:    kshiftlw $15, %k0, %k0
+; SKX-NEXT:    kshiftrw $15, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    testb $1, %al
+; SKX-NEXT:    fxch %st(3)
+; SKX-NEXT:    fcmovne %st(4), %st(0)
+; SKX-NEXT:    fstp %st(4)
+; SKX-NEXT:    fxch %st(3)
+; SKX-NEXT:    fstpt 10(%rdi)
+; SKX-NEXT:    fxch %st(1)
+; SKX-NEXT:    fstpt (%rdi)
+; SKX-NEXT:    fxch %st(1)
+; SKX-NEXT:    fstpt 30(%rdi)
+; SKX-NEXT:    fstpt 20(%rdi)
+; SKX-NEXT:    retq
+ bb:
+   %tmp = select <4 x i1> %m, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80             0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
+   store <4 x x86_fp80> %tmp, <4 x x86_fp80>* %p, align 16
+   ret void
+ }
+
diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll
new file mode 100644
index 000000000000..d3667e3884d4
--- /dev/null
+++ b/test/CodeGen/X86/pr34088.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=pentium4 | FileCheck %s
+
+%struct.Foo = type { i32, %struct.Bar }
+%struct.Bar = type { i32, %struct.Buffer, i32 }
+%struct.Buffer = type { i8*, i32 }
+
+; This test checks that the load of store %2 is not dropped.
+; 
+define i32 @pr34088() local_unnamed_addr {
+; CHECK-LABEL: pr34088:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:  .Lcfi2:
+; CHECK-NEXT:    .cfi_def_cfa_register %ebp
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $32, %esp
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205]
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movaps %xmm0, (%esp)
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movaps %xmm1, (%esp)
+; CHECK-NEXT:    movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+entry:
+  %foo = alloca %struct.Foo, align 4
+  %0 = bitcast %struct.Foo* %foo to i8*
+  call void @llvm.memset.p0i8.i32(i8* nonnull %0, i8 0, i32 20, i32 4, i1 false)
+  %buffer1 = getelementptr inbounds %struct.Foo, %struct.Foo* %foo, i32 0, i32 1, i32 1
+  %1 = bitcast %struct.Buffer* %buffer1 to i64*
+  %2 = load i64, i64* %1, align 4
+  call void @llvm.memset.p0i8.i32(i8* nonnull %0, i8 -51, i32 20, i32 4, i1 false)
+  store i64 %2, i64* %1, align 4
+  ret i32 0
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1)
diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll
new file mode 100644
index 000000000000..9e6382faaa59
--- /dev/null
+++ b/test/CodeGen/X86/select-mmx.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+mmx < %s | FileCheck %s --check-prefix=I32
+
+
+; From source: clang -02
+;__m64 test47(int a)
+;{
+;    __m64 x = (a)? (__m64)(7): (__m64)(0);
+; return __builtin_ia32_psllw(x, x);
+;}
+
+define i64 @test47(i64 %arg)  {
+;
+; X64-LABEL: test47:
+; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    movl $7, %ecx
+; X64-NEXT:    cmoveq %rcx, %rax
+; X64-NEXT:    movd %rax, %mm0
+; X64-NEXT:    psllw %mm0, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
+;
+; I32-LABEL: test47:
+; I32:       # BB#0:
+; I32-NEXT:    pushl %ebp
+; I32-NEXT:  .Lcfi0:
+; I32-NEXT:    .cfi_def_cfa_offset 8
+; I32-NEXT:  .Lcfi1:
+; I32-NEXT:    .cfi_offset %ebp, -8
+; I32-NEXT:    movl %esp, %ebp
+; I32-NEXT:  .Lcfi2:
+; I32-NEXT:    .cfi_def_cfa_register %ebp
+; I32-NEXT:    andl $-8, %esp
+; I32-NEXT:    subl $16, %esp
+; I32-NEXT:    movl 8(%ebp), %eax
+; I32-NEXT:    orl 12(%ebp), %eax
+; I32-NEXT:    movl $7, %eax
+; I32-NEXT:    je .LBB0_2
+; I32-NEXT:  # BB#1:
+; I32-NEXT:    xorl %eax, %eax
+; I32-NEXT:  .LBB0_2:
+; I32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I32-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; I32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I32-NEXT:    psllw %mm0, %mm0
+; I32-NEXT:    movq %mm0, (%esp)
+; I32-NEXT:    movl (%esp), %eax
+; I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I32-NEXT:    movl %ebp, %esp
+; I32-NEXT:    popl %ebp
+; I32-NEXT:    retl
+  %cond = icmp eq i64 %arg, 0
+  %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
+  %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
+  %retc = bitcast x86_mmx %psll to i64
+  ret i64 %retc
+}
+
+
+; From source: clang -O2
+;__m64 test49(int a, long long n, long long m)
+;{
+;    __m64 x = (a)? (__m64)(n): (__m64)(m);
+; return __builtin_ia32_psllw(x, x);
+;}
+
+define i64 @test49(i64 %arg, i64 %x, i64 %y) {
+;
+; X64-LABEL: test49:
+; X64:       # BB#0:
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %rdx, %rsi
+; X64-NEXT:    movd %rsi, %mm0
+; X64-NEXT:    psllw %mm0, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
+;
+; I32-LABEL: test49:
+; I32:       # BB#0:
+; I32-NEXT:    pushl %ebp
+; I32-NEXT:  .Lcfi3:
+; I32-NEXT:    .cfi_def_cfa_offset 8
+; I32-NEXT:  .Lcfi4:
+; I32-NEXT:    .cfi_offset %ebp, -8
+; I32-NEXT:    movl %esp, %ebp
+; I32-NEXT:  .Lcfi5:
+; I32-NEXT:    .cfi_def_cfa_register %ebp
+; I32-NEXT:    andl $-8, %esp
+; I32-NEXT:    subl $8, %esp
+; I32-NEXT:    movl 8(%ebp), %eax
+; I32-NEXT:    orl 12(%ebp), %eax
+; I32-NEXT:    je .LBB1_1
+; I32-NEXT:  # BB#2:
+; I32-NEXT:    leal 24(%ebp), %eax
+; I32-NEXT:    jmp .LBB1_3
+; I32-NEXT:  .LBB1_1:
+; I32-NEXT:    leal 16(%ebp), %eax
+; I32-NEXT:  .LBB1_3:
+; I32-NEXT:    movq (%eax), %mm0
+; I32-NEXT:    psllw %mm0, %mm0
+; I32-NEXT:    movq %mm0, (%esp)
+; I32-NEXT:    movl (%esp), %eax
+; I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I32-NEXT:    movl %ebp, %esp
+; I32-NEXT:    popl %ebp
+; I32-NEXT:    retl
+  %cond = icmp eq i64 %arg, 0
+  %xmmx = bitcast i64 %x to x86_mmx
+  %ymmx = bitcast i64 %y to x86_mmx
+  %slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx
+  %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
+  %retc = bitcast x86_mmx %psll to i64
+  ret i64 %retc
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx)
+
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index abba0ff87ace..9f1ed021992d 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1643,7 +1643,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movsbl (%rdi), %eax
 ; AVX512VL-NEXT:    shrl $8, %eax
-; AVX512VL-NEXT:    vpbroadcastb %al, %xmm0
+; AVX512VL-NEXT:    vpbroadcastb %eax, %xmm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i8, i8* %ptr, align 1
   %tmp1 = sext i8 %tmp to i32
@@ -1696,7 +1696,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movsbl (%rdi), %eax
 ; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    vpbroadcastb %al, %xmm0
+; AVX512VL-NEXT:    vpbroadcastb %eax, %xmm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i8, i8* %ptr, align 1
   %tmp1 = sext i8 %tmp to i32
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index c03b9d1472c1..1cf8453fc6ad 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2274,7 +2274,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
 ; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movswl (%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %ax, %xmm0
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i32
@@ -2390,7 +2390,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movswl (%rdi), %eax
 ; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    vpbroadcastw %ax, %xmm0
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i32
@@ -2443,7 +2443,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movswl (%rdi), %eax
 ; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    vpbroadcastw %ax, %xmm0
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i32
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 6f5d916f2294..ba7c0894b932 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4069,7 +4069,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
 ; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movswl (%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %ax, %ymm0
+; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i32
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 05a797cb6f8e..d51b69415b93 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2431,7 +2431,7 @@ define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movsbl (%rdi), %eax
 ; AVX512VL-NEXT:    shrl $8, %eax
-; AVX512VL-NEXT:    vpbroadcastb %al, %ymm0
+; AVX512VL-NEXT:    vpbroadcastb %eax, %ymm0
 ; AVX512VL-NEXT:    retq
   %tmp = load i8, i8* %ptr, align 1
   %tmp1 = sext i8 %tmp to i32
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 7a5c992bb829..b8fc27ba5515 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -228,7 +228,7 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
 ; SKX-LABEL: insert_dup_mem_v32i16_i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movl (%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %ax, %zmm0
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -249,7 +249,7 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
 ; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movswl (%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %ax, %zmm0
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i32
@@ -269,7 +269,7 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 {
 ; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movzwl 2(%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %ax, %zmm0
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -288,7 +288,7 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
 ; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movzwl 2(%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %ax, %zmm0
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i32, i32* %ptr, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index f4650ec741a7..9dca3191e06b 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -332,7 +332,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    movsbl (%rdi), %eax
 ; AVX512BW-NEXT:    shrl $8, %eax
-; AVX512BW-NEXT:    vpbroadcastb %al, %zmm0
+; AVX512BW-NEXT:    vpbroadcastb %eax, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
@@ -348,7 +348,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
 ; AVX512VBMI:       # BB#0:
 ; AVX512VBMI-NEXT:    movsbl (%rdi), %eax
 ; AVX512VBMI-NEXT:    shrl $8, %eax
-; AVX512VBMI-NEXT:    vpbroadcastb %al, %zmm0
+; AVX512VBMI-NEXT:    vpbroadcastb %eax, %zmm0
 ; AVX512VBMI-NEXT:    retq
   %tmp = load i8, i8* %ptr, align 1
   %tmp1 = sext i8 %tmp to i32
diff --git a/test/Instrumentation/DataFlowSanitizer/Inputs/shadow-args-abilist.txt b/test/Instrumentation/DataFlowSanitizer/Inputs/shadow-args-abilist.txt
new file mode 100644
index 000000000000..723cbc9086da
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/Inputs/shadow-args-abilist.txt
@@ -0,0 +1,8 @@
+fun:dfsan_get_label=uninstrumented
+fun:dfsan_get_label=custom
+
+fun:k2=uninstrumented
+fun:k2=custom
+
+fun:k4=uninstrumented
+fun:k4=custom
diff --git a/test/Instrumentation/DataFlowSanitizer/abilist.ll b/test/Instrumentation/DataFlowSanitizer/abilist.ll
index 8b30875a03fa..e33237ffe19d 100644
--- a/test/Instrumentation/DataFlowSanitizer/abilist.ll
+++ b/test/Instrumentation/DataFlowSanitizer/abilist.ll
@@ -47,13 +47,13 @@ define void @f(i32 %x) {
   ; CHECK: %[[LABELVA1:.*]] = alloca [2 x i16]
   ; CHECK: %[[LABELRETURN:.*]] = alloca i16
 
-  ; CHECK: call void @__dfsw_custom1(i32 1, i32 2, i16 0, i16 0)
+  ; CHECK: call void @__dfsw_custom1(i32 1, i32 2, i16 zeroext 0, i16 zeroext 0)
   call void @custom1(i32 1, i32 2)
 
-  ; CHECK: call i32 @__dfsw_custom2(i32 1, i32 2, i16 0, i16 0, i16* %[[LABELRETURN]])
+  ; CHECK: call i32 @__dfsw_custom2(i32 1, i32 2, i16 zeroext 0, i16 zeroext 0, i16* %[[LABELRETURN]])
   call i32 @custom2(i32 1, i32 2)
 
-  ; CHECK: call void @__dfsw_customcb({{.*}} @"dfst0$customcb", i8* bitcast ({{.*}} @"dfs$cb" to i8*), i16 0)
+  ; CHECK: call void @__dfsw_customcb({{.*}} @"dfst0$customcb", i8* bitcast ({{.*}} @"dfs$cb" to i8*), i16 zeroext 0)
   call void @customcb(i32 (i32)* @cb)
 
   ; CHECK: %[[LABELVA1_0:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA1]], i32 0, i32 0
@@ -61,12 +61,12 @@ define void @f(i32 %x) {
   ; CHECK: %[[LABELVA1_1:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA1]], i32 0, i32 1
   ; CHECK: store i16 %{{.*}}, i16* %[[LABELVA1_1]]
   ; CHECK: %[[LABELVA1_0A:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA1]], i32 0, i32 0
-  ; CHECK: call void (i32, i16, i16*, ...) @__dfsw_custom3(i32 1, i16 0, i16* %[[LABELVA1_0A]], i32 2, i32 %{{.*}})
+  ; CHECK: call void (i32, i16, i16*, ...) @__dfsw_custom3(i32 1, i16 zeroext 0, i16* %[[LABELVA1_0A]], i32 2, i32 %{{.*}})
   call void (i32, ...) @custom3(i32 1, i32 2, i32 %x)
 
   ; CHECK: %[[LABELVA2_0:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA2]], i32 0, i32 0
   ; CHECK: %[[LABELVA2_0A:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA2]], i32 0, i32 0
-  ; CHECK: call i32 (i32, i16, i16*, i16*, ...) @__dfsw_custom4(i32 1, i16 0, i16* %[[LABELVA2_0A]], i16* %[[LABELRETURN]], i32 2, i32 3)
+  ; CHECK: call i32 (i32, i16, i16*, i16*, ...) @__dfsw_custom4(i32 1, i16 zeroext 0, i16* %[[LABELVA2_0A]], i16* %[[LABELRETURN]], i32 2, i32 3)
   call i32 (i32, ...) @custom4(i32 1, i32 2, i32 3)
 
   ret void
diff --git a/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll b/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
new file mode 100644
index 000000000000..0ffbf1970e7f
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
@@ -0,0 +1,54 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt  | FileCheck %s
+
+; REQUIRES: x86-registered-target
+
+; Test that the custom abi marks shadow parameters as zero extended.
+
+define i32 @m() {
+entry:
+  %call = call zeroext i16 @dfsan_get_label(i64 signext 56)
+  %conv = zext i16 %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: @"dfs$m"
+; CHECK: %{{.*}} = call zeroext i16 @__dfsw_dfsan_get_label(i64 signext 56, i16 zeroext 0, i16* %{{.*}})
+
+define i32 @k() {
+entry:
+  %call = call zeroext i16 @k2(i64 signext 56, i64 signext 67)
+  %conv = zext i16 %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: @"dfs$k"
+; CHECK: %{{.*}} = call zeroext i16 @__dfsw_k2(i64 signext 56, i64 signext 67, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16* %{{.*}})
+
+define i32 @k3() {
+entry:
+  %call = call zeroext i16 @k4(i64 signext 56, i64 signext 67, i64 signext 78, i64 signext 89)
+  %conv = zext i16 %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: @"dfs$k3"
+; CHECK: %{{.*}} = call zeroext i16 @__dfsw_k4(i64 signext 56, i64 signext 67, i64 signext 78, i64 signext 89, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16* %{{.*}})
+
+declare zeroext i16 @dfsan_get_label(i64 signext)
+
+; CHECK-LABEL: @"dfsw$dfsan_get_label"
+; CHECK: %{{.*}} = call i16 @__dfsw_dfsan_get_label(i64 %0, i16 zeroext %1, i16* %{{.*}})
+
+declare zeroext i16 @k2(i64 signext, i64 signext)
+; CHECK-LABEL: @"dfsw$k2"
+; CHECK: %{{.*}} = call i16 @__dfsw_k2(i64 %{{.*}}, i64 %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16* %{{.*}})
+
+declare zeroext i16 @k4(i64 signext, i64 signext, i64 signext, i64 signext)
+
+; CHECK-LABEL: @"dfsw$k4"
+; CHECK: %{{.*}} = call i16 @__dfsw_k4(i64 %{{.*}}, i64 %{{.*}}, i64  %{{.*}}, i64 %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16* %{{.*}})
+
+
+; CHECK: declare zeroext i16 @__dfsw_dfsan_get_label(i64 signext, i16, i16*)
+; CHECK: declare zeroext i16 @__dfsw_k2(i64 signext, i64 signext, i16, i16, i16*)
+; CHECK: declare zeroext i16 @__dfsw_k4(i64 signext, i64 signext, i64 signext, i64 signext, i16, i16, i16, i16, i16*)
diff --git a/test/Transforms/BDCE/invalidate-assumptions.ll b/test/Transforms/BDCE/invalidate-assumptions.ll
new file mode 100644
index 000000000000..d165d74be86d
--- /dev/null
+++ b/test/Transforms/BDCE/invalidate-assumptions.ll
@@ -0,0 +1,100 @@
+; RUN: opt -bdce %s -S | FileCheck %s
+
+; The 'nuw' on the subtract allows us to deduce that %setbit is not demanded.
+; But if we change that value to '0', then the 'nuw' is no longer valid. If we don't
+; remove the 'nuw', another pass (-instcombine) may make a transform based on an
+; that incorrect assumption and we can miscompile:
+; https://bugs.llvm.org/show_bug.cgi?id=33695
+
+define i1 @PR33695(i1 %b, i8 %x) {
+; CHECK-LABEL: @PR33695(
+; CHECK-NEXT:    [[SETBIT:%.*]] = or i8 %x, 64
+; CHECK-NEXT:    [[LITTLE_NUMBER:%.*]] = zext i1 %b to i8
+; CHECK-NEXT:    [[BIG_NUMBER:%.*]] = shl i8 0, 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[SUB]] to i1
+; CHECK-NEXT:    ret i1 [[TRUNC]]
+;
+  %setbit = or i8 %x, 64
+  %little_number = zext i1 %b to i8
+  %big_number = shl i8 %setbit, 1
+  %sub = sub nuw i8 %big_number, %little_number
+  %trunc = trunc i8 %sub to i1
+  ret i1 %trunc
+}
+
+; Similar to above, but now with more no-wrap.
+; https://bugs.llvm.org/show_bug.cgi?id=34037
+
+define i64 @PR34037(i64 %m, i32 %r, i64 %j, i1 %b, i32 %k, i64 %p) {
+; CHECK-LABEL: @PR34037(
+; CHECK-NEXT:    [[CONV:%.*]] = zext i32 %r to i64
+; CHECK-NEXT:    [[AND:%.*]] = and i64 %m, 0
+; CHECK-NEXT:    [[NEG:%.*]] = xor i64 0, 34359738367
+; CHECK-NEXT:    [[OR:%.*]] = or i64 %j, 0
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 0, 29
+; CHECK-NEXT:    [[CONV1:%.*]] = select i1 %b, i64 7, i64 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[SHL]], [[CONV1]]
+; CHECK-NEXT:    [[CONV2:%.*]] = zext i32 %k to i64
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[SUB]], [[CONV2]]
+; CHECK-NEXT:    [[CONV4:%.*]] = and i64 %p, 65535
+; CHECK-NEXT:    [[AND5:%.*]] = and i64 [[MUL]], [[CONV4]]
+; CHECK-NEXT:    ret i64 [[AND5]]
+;
+  %conv = zext i32 %r to i64
+  %and = and i64 %m, %conv
+  %neg = xor i64 %and, 34359738367
+  %or = or i64 %j, %neg
+  %shl = shl i64 %or, 29
+  %conv1 = select i1 %b, i64 7, i64 0
+  %sub = sub nuw nsw i64 %shl, %conv1
+  %conv2 = zext i32 %k to i64
+  %mul = mul nsw i64 %sub, %conv2
+  %conv4 = and i64 %p, 65535
+  %and5 = and i64 %mul, %conv4
+  ret i64 %and5
+}
+
+; This is a manufactured example based on the 1st test to prove that the
+; assumption-killing algorithm stops at the call. Ie, it does not remove
+; nsw/nuw from the 'add' because a call demands all bits of its argument.
+
+declare i1 @foo(i1)
+
+define i1 @poison_on_call_user_is_ok(i1 %b, i8 %x) {
+; CHECK-LABEL: @poison_on_call_user_is_ok(
+; CHECK-NEXT:    [[SETBIT:%.*]] = or i8 %x, 64
+; CHECK-NEXT:    [[LITTLE_NUMBER:%.*]] = zext i1 %b to i8
+; CHECK-NEXT:    [[BIG_NUMBER:%.*]] = shl i8 0, 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[SUB]] to i1
+; CHECK-NEXT:    [[CALL_RESULT:%.*]] = call i1 @foo(i1 [[TRUNC]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i1 [[CALL_RESULT]], true
+; CHECK-NEXT:    [[MUL:%.*]] = mul i1 [[TRUNC]], [[ADD]]
+; CHECK-NEXT:    ret i1 [[MUL]]
+;
+  %setbit = or i8 %x, 64
+  %little_number = zext i1 %b to i8
+  %big_number = shl i8 %setbit, 1
+  %sub = sub nuw i8 %big_number, %little_number
+  %trunc = trunc i8 %sub to i1
+  %call_result = call i1 @foo(i1 %trunc)
+  %add = add nsw nuw i1 %call_result, 1
+  %mul = mul i1 %trunc, %add
+  ret i1 %mul
+}
+
+
+; We were asserting that all users of a trivialized integer-type instruction were
+; also integer-typed, but that's too strong. The alloca has a pointer-type result.
+
+define void @PR34179(i32* %a) {
+; CHECK-LABEL: @PR34179(
+; CHECK-NEXT:    [[T0:%.*]] = load volatile i32, i32* %a
+; CHECK-NEXT:    ret void
+;
+  %t0 = load volatile i32, i32* %a
+  %vla = alloca i32, i32 %t0
+  ret void
+}
+
diff --git a/test/Transforms/IndVarSimplify/exit_value_test2.ll b/test/Transforms/IndVarSimplify/exit_value_test2.ll
index ee641667506c..7b6e91a742b2 100644
--- a/test/Transforms/IndVarSimplify/exit_value_test2.ll
+++ b/test/Transforms/IndVarSimplify/exit_value_test2.ll
@@ -3,15 +3,14 @@
 
 ; Check IndVarSimplify should not replace exit value because or else
 ; udiv will be introduced by expand and the cost will be high.
-;
-; CHECK-LABEL: @_Z3fooPKcjj(
-; CHECK-NOT: udiv
 
 declare void @_Z3mixRjj(i32* dereferenceable(4), i32)
 declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define i32 @_Z3fooPKcjj(i8* nocapture readonly %s, i32 %len, i32 %c) {
+; CHECK-LABEL: @_Z3fooPKcjj(
+; CHECK-NOT: udiv
 entry:
   %a = alloca i32, align 4
   %tmp = bitcast i32* %a to i8*
@@ -50,3 +49,26 @@ while.end:                                        ; preds = %while.cond.while.en
   call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp)
   ret i32 %tmp4
 }
+
+define i32 @zero_backedge_count_test(i32 %unknown_init, i32* %unknown_mem) {
+; CHECK-LABEL: @zero_backedge_count_test(
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry], [ %iv.inc, %loop ]
+  %unknown_phi = phi i32 [ %unknown_init, %entry ], [ %unknown_next, %loop ]
+  %iv.inc = add i32 %iv, 1
+  %be_taken = icmp ne i32 %iv.inc, 1
+  %unknown_next = load volatile i32, i32* %unknown_mem
+  br i1 %be_taken, label %loop, label %leave
+
+leave:
+; We can fold %unknown_phi even though the backedge value for it is completely
+; unknown, since we can prove that the loop's backedge taken count is 0.
+
+; CHECK: leave:
+; CHECK: ret i32 %unknown_init
+  %exit_val = phi i32 [ %unknown_phi, %loop ]
+  ret i32 %exit_val
+}
diff --git a/test/Transforms/SimplifyCFG/pr34131.ll b/test/Transforms/SimplifyCFG/pr34131.ll
new file mode 100644
index 000000000000..b64b6876e04e
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/pr34131.ll
@@ -0,0 +1,74 @@
+; RUN: opt -simplifycfg -S < %s | FileCheck %s
+
+; Just checking for lack of crash here, but we should be able to check the IR?
+; Earlier version using auto-generated checks from utils/update_test_checks.py
+; had bot problems though...
+
+define void @patatino() {
+
+; CHECK-LABEL: @patatino
+
+  br label %bb1
+bb1:                                              ; preds = %bb36, %0
+  br label %bb2
+bb2:                                              ; preds = %bb3, %bb1
+  br i1 undef, label %bb4, label %bb3
+bb3:                                              ; preds = %bb4, %bb2
+  br i1 undef, label %bb2, label %bb5
+bb4:                                              ; preds = %bb2
+  switch i32 undef, label %bb3 [
+  ]
+bb5:                                              ; preds = %bb3
+  br label %bb6
+bb6:                                              ; preds = %bb5
+  br i1 undef, label %bb7, label %bb9
+bb7:                                              ; preds = %bb6
+  %tmp = or i64 undef, 1
+  %tmp8 = icmp ult i64 %tmp, 0
+  br i1 %tmp8, label %bb12, label %bb9
+bb9:                                              ; preds = %bb35, %bb34, %bb33, %bb32, %bb31, %bb30, %bb27, %bb24, %bb21, %bb18, %bb16, %bb14, %bb12, %bb7, %bb6
+  br label %bb11
+bb10:                                             ; preds = %bb36
+  br label %bb11
+bb11:                                             ; preds = %bb10, %bb9
+  ret void
+bb12:                                             ; preds = %bb7
+  %tmp13 = icmp ult i64 0, 0
+  br i1 %tmp13, label %bb14, label %bb9
+bb14:                                             ; preds = %bb12
+  %tmp15 = icmp ult i64 undef, 0
+  br i1 %tmp15, label %bb16, label %bb9
+bb16:                                             ; preds = %bb14
+  %tmp17 = icmp ult i64 undef, 0
+  br i1 %tmp17, label %bb18, label %bb9
+bb18:                                             ; preds = %bb16
+  %tmp19 = or i64 undef, 5
+  %tmp20 = icmp ult i64 %tmp19, 0
+  br i1 %tmp20, label %bb21, label %bb9
+bb21:                                             ; preds = %bb18
+  %tmp22 = or i64 undef, 6
+  %tmp23 = icmp ult i64 %tmp22, 0
+  br i1 %tmp23, label %bb24, label %bb9
+bb24:                                             ; preds = %bb21
+  %tmp25 = or i64 undef, 7
+  %tmp26 = icmp ult i64 %tmp25, 0
+  br i1 %tmp26, label %bb27, label %bb9
+bb27:                                             ; preds = %bb24
+  %tmp28 = or i64 undef, 8
+  %tmp29 = icmp ult i64 %tmp28, 0
+  br i1 %tmp29, label %bb30, label %bb9
+bb30:                                             ; preds = %bb27
+  br i1 undef, label %bb31, label %bb9
+bb31:                                             ; preds = %bb30
+  br i1 undef, label %bb32, label %bb9
+bb32:                                             ; preds = %bb31
+  br i1 undef, label %bb33, label %bb9
+bb33:                                             ; preds = %bb32
+  br i1 undef, label %bb34, label %bb9
+bb34:                                             ; preds = %bb33
+  br i1 undef, label %bb35, label %bb9
+bb35:                                             ; preds = %bb34
+  br i1 undef, label %bb36, label %bb9
+bb36:                                             ; preds = %bb35
+  br i1 undef, label %bb1, label %bb10
+}
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index d54b45515f05..74593e6202aa 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -871,7 +871,7 @@ static void printRelocationTargetName(const MachOObjectFile *O,
   uint64_t Val = O->getPlainRelocationSymbolNum(RE);
 
   if (O->getAnyRelocationType(RE) == MachO::ARM64_RELOC_ADDEND) {
-    fmt << format("0x%x", Val);
+    fmt << format("0x%0" PRIx64, Val);
     return;
   } else if (isExtern) {
     symbol_iterator SI = O->symbol_begin();
diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index 2ef0a8f77ec9..3351ebed54bd 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -25,7 +25,7 @@ class LitConfig(object):
                  params, config_prefix = None,
                  maxIndividualTestTime = 0,
                  maxFailures = None,
-                 parallelism_groups = [],
+                 parallelism_groups = {},
                  echo_all_commands = False):
         # The name of the test runner.
         self.progname = progname
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 46bcac4b306e..a60a0f854870 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -313,7 +313,7 @@ def processRedirects(cmd, stdin_source, cmd_shenv, opened_files):
         elif op == ('<',):
             redirects[0] = [filename, 'r', None]
         else:
-            raise InternalShellError(cmd, "Unsupported redirect: %r" % (r,))
+            raise InternalShellError(cmd, "Unsupported redirect: %r" % ((op, filename),))
 
     # Open file descriptors in a second pass.
     std_fds = [None, None, None]
diff --git a/utils/lit/lit/formats/__init__.py b/utils/lit/lit/formats/__init__.py
index 7d14ca4b535a..3ff46e93ead2 100644
--- a/utils/lit/lit/formats/__init__.py
+++ b/utils/lit/lit/formats/__init__.py
@@ -1,3 +1,8 @@
-from lit.formats.base import TestFormat  # noqa: F401
+from lit.formats.base import (  # noqa: F401
+    TestFormat,
+    FileBasedTest,
+    OneCommandPerFileTest
+)
+
 from lit.formats.googletest import GoogleTest  # noqa: F401
 from lit.formats.shtest import ShTest  # noqa: F401
diff --git a/utils/lit/lit/formats/base.py b/utils/lit/lit/formats/base.py
index baa9ff1d3b7d..6721d17e334e 100644
--- a/utils/lit/lit/formats/base.py
+++ b/utils/lit/lit/formats/base.py
@@ -1,50 +1,117 @@
-import abc
+from __future__ import absolute_import
+import os
+
+import lit.Test
+import lit.util
 
 class TestFormat(object):
-    """Base class for test formats.
-
-    A TestFormat encapsulates logic for finding and executing a certain type of
-    test. For example, a subclass FooTestFormat would contain the logic for
-    finding tests written in the 'Foo' format, and the logic for running a
-    single one.
-
-    TestFormat is an Abstract Base Class (ABC). It uses the Python abc.ABCMeta
-    type and associated @abc.abstractmethod decorator. Together, these provide
-    subclass behaviour which is notionally similar to C++ pure virtual classes:
-    only subclasses which implement all abstract methods can be instantiated
-    (the implementation may come from an intermediate base).
-
-    For details on ABCs, see: https://docs.python.org/2/library/abc.html. Note
-    that Python ABCs have extensive abilities beyond what is used here. For
-    TestFormat, we only care about enforcing that abstract methods are
-    implemented.
-    """
-
-    __metaclass__ = abc.ABCMeta
-
-    @abc.abstractmethod
-    def getTestsInDirectory(self, testSuite, path_in_suite, litConfig,
-                            localConfig):
-      """Finds tests of this format in the given directory.
-
-      Args:
-          testSuite: a Test.TestSuite object.
-          path_in_suite: the subpath under testSuite to look for tests.
-          litConfig: the LitConfig for the test suite.
-          localConfig: a LitConfig with local specializations.
-
-      Returns:
-          An iterable of Test.Test objects.
-      """
-
-    @abc.abstractmethod
+    pass
+
+###
+
+class FileBasedTest(TestFormat):
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        source_path = testSuite.getSourcePath(path_in_suite)
+        for filename in os.listdir(source_path):
+            # Ignore dot files and excluded tests.
+            if (filename.startswith('.') or
+                filename in localConfig.excludes):
+                continue
+
+            filepath = os.path.join(source_path, filename)
+            if not os.path.isdir(filepath):
+                base,ext = os.path.splitext(filename)
+                if ext in localConfig.suffixes:
+                    yield lit.Test.Test(testSuite, path_in_suite + (filename,),
+                                        localConfig)
+
+###
+
+import re
+import tempfile
+
+class OneCommandPerFileTest(TestFormat):
+    # FIXME: Refactor into generic test for running some command on a directory
+    # of inputs.
+
+    def __init__(self, command, dir, recursive=False,
+                 pattern=".*", useTempInput=False):
+        if isinstance(command, str):
+            self.command = [command]
+        else:
+            self.command = list(command)
+        if dir is not None:
+            dir = str(dir)
+        self.dir = dir
+        self.recursive = bool(recursive)
+        self.pattern = re.compile(pattern)
+        self.useTempInput = useTempInput
+
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        dir = self.dir
+        if dir is None:
+            dir = testSuite.getSourcePath(path_in_suite)
+
+        for dirname,subdirs,filenames in os.walk(dir):
+            if not self.recursive:
+                subdirs[:] = []
+
+            subdirs[:] = [d for d in subdirs
+                          if (d != '.svn' and
+                              d not in localConfig.excludes)]
+
+            for filename in filenames:
+                if (filename.startswith('.') or
+                    not self.pattern.match(filename) or
+                    filename in localConfig.excludes):
+                    continue
+
+                path = os.path.join(dirname,filename)
+                suffix = path[len(dir):]
+                if suffix.startswith(os.sep):
+                    suffix = suffix[1:]
+                test = lit.Test.Test(
+                    testSuite, path_in_suite + tuple(suffix.split(os.sep)),
+                    localConfig)
+                # FIXME: Hack?
+                test.source_path = path
+                yield test
+
+    def createTempInput(self, tmp, test):
+        raise NotImplementedError('This is an abstract method.')
+
     def execute(self, test, litConfig):
-      """Runs the given 'test', which is of this format.
+        if test.config.unsupported:
+            return (lit.Test.UNSUPPORTED, 'Test is unsupported')
+
+        cmd = list(self.command)
+
+        # If using temp input, create a temporary file and hand it to the
+        # subclass.
+        if self.useTempInput:
+            tmp = tempfile.NamedTemporaryFile(suffix='.cpp')
+            self.createTempInput(tmp, test)
+            tmp.flush()
+            cmd.append(tmp.name)
+        elif hasattr(test, 'source_path'):
+            cmd.append(test.source_path)
+        else:
+            cmd.append(test.getSourcePath())
+
+        out, err, exitCode = lit.util.executeCommand(cmd)
+
+        diags = out + err
+        if not exitCode and not diags.strip():
+            return lit.Test.PASS,''
 
-      Args:
-          test: a Test.Test object describing the test to run.
-          litConfig: the LitConfig for the test suite.
+        # Try to include some useful information.
+        report = """Command: %s\n""" % ' '.join(["'%s'" % a
+                                                 for a in cmd])
+        if self.useTempInput:
+            report += """Temporary File: %s\n""" % tmp.name
+            report += "--\n%s--\n""" % open(tmp.name).read()
+        report += """Output:\n--\n%s--""" % diags
 
-      Returns:
-          A tuple of (status:Test.ResultCode, message:str)
-      """
+        return lit.Test.FAIL, report
diff --git a/utils/lit/lit/formats/shtest.py b/utils/lit/lit/formats/shtest.py
index 01ecd192092e..fdc9bd0241f3 100644
--- a/utils/lit/lit/formats/shtest.py
+++ b/utils/lit/lit/formats/shtest.py
@@ -1,13 +1,12 @@
 from __future__ import absolute_import
 
-import os
-
-import lit.Test
 import lit.TestRunner
 import lit.util
-from .base import TestFormat
 
-class ShTest(TestFormat):
+from .base import FileBasedTest
+
+
+class ShTest(FileBasedTest):
     """ShTest is a format with one file per test.
 
     This is the primary format for regression tests as described in the LLVM
@@ -18,31 +17,9 @@ class ShTest(TestFormat):
     The ShTest files contain some number of shell-like command pipelines, along
     with assertions about what should be in the output.
     """
-
-    def __init__(self, execute_external = False):
-        """Initializer.
-
-        The 'execute_external' argument controls whether lit uses its internal
-        logic for command pipelines, or passes the command to a shell
-        subprocess.
-
-        Args:
-            execute_external: (optional) If true, use shell subprocesses instead
-                of lit's internal pipeline logic.
-        """
+    def __init__(self, execute_external=False):
         self.execute_external = execute_external
 
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        """Yields test files matching 'suffixes' from the localConfig."""
-        file_matches = lit.util.listdir_files(
-            testSuite.getSourcePath(path_in_suite),
-            localConfig.suffixes, localConfig.excludes)
-        for filename in file_matches:
-            yield lit.Test.Test(testSuite, path_in_suite + (filename,),
-                                localConfig)
-
     def execute(self, test, litConfig):
-        """Interprets and runs the given test file, and returns the result."""
         return lit.TestRunner.executeShTest(test, litConfig,
                                             self.execute_external)
diff --git a/utils/lit/lit/run.py b/utils/lit/lit/run.py
index 1290c142c834..3e39bdb92203 100644
--- a/utils/lit/lit/run.py
+++ b/utils/lit/lit/run.py
@@ -44,6 +44,12 @@ class Run(object):
     def __init__(self, lit_config, tests):
         self.lit_config = lit_config
         self.tests = tests
+        # Set up semaphores to limit parallelism of certain classes of tests.
+        # For example, some ASan tests require lots of virtual memory and run
+        # faster with less parallelism on OS X.
+        self.parallelism_semaphores = \
+                {k: multiprocessing.Semaphore(v) for k, v in
+                 self.lit_config.parallelism_groups.items()}
 
     def execute_test(self, test):
         return _execute_test_impl(test, self.lit_config,
@@ -74,13 +80,6 @@ class Run(object):
         if not self.tests or jobs == 0:
             return
 
-        # Set up semaphores to limit parallelism of certain classes of tests.
-        # For example, some ASan tests require lots of virtual memory and run
-        # faster with less parallelism on OS X.
-        self.parallelism_semaphores = \
-                {k: multiprocessing.Semaphore(v) for k, v in
-                 self.lit_config.parallelism_groups.items()}
-
         # Install a console-control signal handler on Windows.
         if win32api is not None:
             def console_ctrl_handler(type):
diff --git a/utils/lit/tests/Inputs/max-failures/lit.cfg b/utils/lit/tests/Inputs/max-failures/lit.cfg
new file mode 100644
index 000000000000..50d07566e1cc
--- /dev/null
+++ b/utils/lit/tests/Inputs/max-failures/lit.cfg
@@ -0,0 +1,6 @@
+import lit.formats
+config.name = 'shtest-shell'
+config.suffixes = ['.txt']
+config.test_format = lit.formats.ShTest()
+config.test_source_root = os.path.dirname(__file__) + '/../shtest-shell'
+config.test_exec_root = None
diff --git a/utils/lit/tests/max-failures.py b/utils/lit/tests/max-failures.py
index 5cc258dd08aa..bc58e9a4e47f 100644
--- a/utils/lit/tests/max-failures.py
+++ b/utils/lit/tests/max-failures.py
@@ -1,9 +1,9 @@
 # Check the behavior of --max-failures option.
 #
-# RUN: not %{lit} -j 1 -v %{inputs}/shtest-shell > %t.out
-# RUN: not %{lit} --max-failures=1 -j 1 -v %{inputs}/shtest-shell >> %t.out
-# RUN: not %{lit} --max-failures=2 -j 1 -v %{inputs}/shtest-shell >> %t.out
-# RUN: not %{lit} --max-failures=0 -j 1 -v %{inputs}/shtest-shell 2>> %t.out
+# RUN: not %{lit} -j 1 -v %{inputs}/max-failures > %t.out
+# RUN: not %{lit} --max-failures=1 -j 1 -v %{inputs}/max-failures >> %t.out
+# RUN: not %{lit} --max-failures=2 -j 1 -v %{inputs}/max-failures >> %t.out
+# RUN: not %{lit} --max-failures=0 -j 1 -v %{inputs}/max-failures 2>> %t.out
 # RUN: FileCheck < %t.out %s
 #
 # END.
diff --git a/utils/lit/tests/selecting.py b/utils/lit/tests/selecting.py
index 19ba240f9b0f..4a0d08b860b8 100644
--- a/utils/lit/tests/selecting.py
+++ b/utils/lit/tests/selecting.py
@@ -9,7 +9,7 @@
 
 # Check that regex-filtering based on environment variables work.
 #
-# RUN: LIT_FILTER='o[a-z]e' %{lit} %{inputs}/discovery | FileCheck --check-prefix=CHECK-FILTER-ENV %s
+# RUN: env LIT_FILTER='o[a-z]e' %{lit} %{inputs}/discovery | FileCheck --check-prefix=CHECK-FILTER-ENV %s
 # CHECK-FILTER-ENV: Testing: 2 of 5 tests
 
 
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index 02d8e7925f6e..66a2c578083e 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -403,14 +403,6 @@ function test_llvmCore() {
     fi
 
     if [ $do_test_suite = 'yes' ]; then
-      SandboxDir="$BuildDir/sandbox"
-      Lit=$SandboxDir/bin/lit
-      TestSuiteBuildDir="$BuildDir/test-suite-build"
-      TestSuiteSrcDir="$BuildDir/test-suite.src"
-
-      virtualenv $SandboxDir
-      $SandboxDir/bin/python $BuildDir/llvm.src/utils/lit/setup.py install
-      mkdir -p $TestSuiteBuildDir
       cd $TestSuiteBuildDir
       env CC="$c_compiler" CXX="$cxx_compiler" \
           cmake $TestSuiteSrcDir -DTEST_SUITE_LIT=$Lit
@@ -466,6 +458,19 @@ if [ "$do_checkout" = "yes" ]; then
     export_sources
 fi
 
+# Setup the test-suite.  Do this early so we can catch failures before
+# we do the full 3 stage build.
+if [ $do_test_suite = "yes" ]; then
+  SandboxDir="$BuildDir/sandbox"
+  Lit=$SandboxDir/bin/lit
+  TestSuiteBuildDir="$BuildDir/test-suite-build"
+  TestSuiteSrcDir="$BuildDir/test-suite.src"
+
+  virtualenv $SandboxDir
+  $SandboxDir/bin/python $BuildDir/llvm.src/utils/lit/setup.py install
+  mkdir -p $TestSuiteBuildDir
+fi
+
 (
 Flavors="Release"
 if [ "$do_debug" = "yes" ]; then