-rw-r--r--docs/Extensions.rst28
-rw-r--r--docs/MIRLangRef.rst45
-rw-r--r--docs/tutorial/LangImpl09.rst5
-rw-r--r--examples/Kaleidoscope/CMakeLists.txt1
-rw-r--r--examples/Kaleidoscope/Chapter9/toy.cpp5
-rw-r--r--include/llvm-c/lto.h14
-rw-r--r--include/llvm/Analysis/AliasAnalysis.h77
-rw-r--r--include/llvm/Analysis/AliasAnalysisEvaluator.h8
-rw-r--r--include/llvm/Analysis/LoopAccessAnalysis.h15
-rw-r--r--include/llvm/Analysis/MemoryDependenceAnalysis.h6
-rw-r--r--include/llvm/Analysis/ProfileSummaryInfo.h8
-rw-r--r--include/llvm/Analysis/ScalarEvolutionExpander.h2
-rw-r--r--include/llvm/Analysis/TargetTransformInfo.h7
-rw-r--r--include/llvm/Analysis/TargetTransformInfoImpl.h2
-rw-r--r--include/llvm/BinaryFormat/Wasm.h2
-rw-r--r--include/llvm/BinaryFormat/WasmRelocs.def (renamed from include/llvm/BinaryFormat/WasmRelocs/WebAssembly.def)0
-rw-r--r--include/llvm/CodeGen/BasicTTIImpl.h14
-rw-r--r--include/llvm/CodeGen/GlobalISel/InstructionSelector.h17
-rw-r--r--include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h6
-rw-r--r--include/llvm/CodeGen/LiveStacks.h (renamed from include/llvm/CodeGen/LiveStackAnalysis.h)8
-rw-r--r--include/llvm/CodeGen/MachineOperand.h7
-rw-r--r--include/llvm/CodeGen/RuntimeLibcalls.def3
-rw-r--r--include/llvm/CodeGen/SDNodeProperties.td34
-rw-r--r--include/llvm/CodeGen/SelectionDAGNodes.h12
-rw-r--r--include/llvm/CodeGen/TargetLowering.h11
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFUnit.h100
-rw-r--r--include/llvm/FuzzMutate/IRMutator.h4
-rw-r--r--include/llvm/IR/Function.h6
-rw-r--r--include/llvm/IR/Intrinsics.td10
-rw-r--r--include/llvm/LTO/legacy/ThinLTOCodeGenerator.h10
-rw-r--r--include/llvm/MC/MCAsmInfo.h3
-rw-r--r--include/llvm/MC/MCStreamer.h11
-rw-r--r--include/llvm/Object/Wasm.h21
-rw-r--r--include/llvm/Support/CachePruning.h5
-rw-r--r--include/llvm/Support/MemoryBuffer.h66
-rw-r--r--include/llvm/Support/YAMLTraits.h8
-rw-r--r--include/llvm/Target/TargetMachine.h18
-rw-r--r--include/llvm/Target/TargetSelectionDAG.td26
-rw-r--r--include/llvm/Transforms/Instrumentation.h2
-rw-r--r--include/llvm/Transforms/Utils/CallPromotionUtils.h18
-rw-r--r--include/llvm/module.modulemap2
-rw-r--r--lib/Analysis/AliasAnalysis.cpp95
-rw-r--r--lib/Analysis/AliasAnalysisEvaluator.cpp61
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp13
-rw-r--r--lib/Analysis/CFGPrinter.cpp8
-rw-r--r--lib/Analysis/GlobalsModRef.cpp21
-rw-r--r--lib/Analysis/InlineCost.cpp95
-rw-r--r--lib/Analysis/LoopAccessAnalysis.cpp71
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp23
-rw-r--r--lib/Analysis/MemorySSA.cpp39
-rw-r--r--lib/Analysis/ModuleSummaryAnalysis.cpp2
-rw-r--r--lib/Analysis/ProfileSummaryInfo.cpp60
-rw-r--r--lib/Analysis/ScalarEvolution.cpp1
-rw-r--r--lib/Analysis/TargetTransformInfo.cpp4
-rw-r--r--lib/Analysis/TypeBasedAliasAnalysis.cpp23
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp1
-rw-r--r--lib/CodeGen/CMakeLists.txt2
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp17
-rw-r--r--lib/CodeGen/GlobalISel/LegalizerHelper.cpp3
-rw-r--r--lib/CodeGen/InlineSpiller.cpp2
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp7
-rw-r--r--lib/CodeGen/LiveStacks.cpp (renamed from lib/CodeGen/LiveStackAnalysis.cpp)4
-rw-r--r--lib/CodeGen/MIRPrinter.cpp197
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp6
-rw-r--r--lib/CodeGen/MachineOperand.cpp232
-rw-r--r--lib/CodeGen/MachineVerifier.cpp2
-rw-r--r--lib/CodeGen/README.txt2
-rw-r--r--lib/CodeGen/RegAllocBasic.cpp2
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp2
-rw-r--r--lib/CodeGen/RegAllocPBQP.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp177
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp8
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp6
-rw-r--r--lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp1
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp3
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp17
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp32
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp2
-rw-r--r--lib/CodeGen/StackSlotColoring.cpp2
-rw-r--r--lib/CodeGen/TargetLoweringBase.cpp68
-rw-r--r--lib/CodeGen/VirtRegMap.cpp2
-rw-r--r--lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp1
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp153
-rw-r--r--lib/DebugInfo/DWARF/DWARFUnit.cpp505
-rw-r--r--lib/Demangle/ItaniumDemangle.cpp3
-rw-r--r--lib/FuzzMutate/IRMutator.cpp19
-rw-r--r--lib/IR/ConstantFold.cpp1
-rw-r--r--lib/IR/Function.cpp4
-rw-r--r--lib/IR/Value.cpp11
-rw-r--r--lib/MC/MCAsmStreamer.cpp47
-rw-r--r--lib/MC/MCStreamer.cpp22
-rw-r--r--lib/MC/WasmObjectWriter.cpp74
-rw-r--r--lib/Object/ELF.cpp1
-rw-r--r--lib/Object/WasmObjectFile.cpp29
-rw-r--r--lib/Object/WindowsResource.cpp6
-rw-r--r--lib/ObjectYAML/WasmYAML.cpp2
-rw-r--r--lib/Passes/LLVMBuild.txt2
-rw-r--r--lib/Support/APFloat.cpp4
-rw-r--r--lib/Support/CachePruning.cpp4
-rw-r--r--lib/Support/MemoryBuffer.cpp162
-rw-r--r--lib/Support/StringRef.cpp2
-rw-r--r--lib/Support/TargetParser.cpp5
-rw-r--r--lib/Support/YAMLTraits.cpp7
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp14
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp5
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp86
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp12
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp13
-rw-r--r--lib/Target/AArch64/AArch64SelectionDAGInfo.cpp8
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp13
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h7
-rw-r--r--lib/Target/AArch64/AArch64SystemOperands.td3
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp7
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.h3
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp36
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp6
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h10
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp8
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.h2
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp34
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp35
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h3
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp4
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td182
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp4
-rw-r--r--lib/Target/ARC/ARCTargetMachine.cpp7
-rw-r--r--lib/Target/ARC/ARCTargetMachine.h2
-rw-r--r--lib/Target/ARM/ARM.h1
-rw-r--r--lib/Target/ARM/ARM.td4
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp2
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp134
-rw-r--r--lib/Target/ARM/ARMISelLowering.h2
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td9
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td4
-rw-r--r--lib/Target/ARM/ARMInstructionSelector.cpp58
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.cpp7
-rw-r--r--lib/Target/ARM/ARMRegisterBankInfo.cpp20
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp5
-rw-r--r--lib/Target/ARM/ARMSubtarget.h9
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp8
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h3
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp19
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.h2
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp10
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp4
-rw-r--r--lib/Target/ARM/Thumb2SizeReduction.cpp8
-rw-r--r--lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp10
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp70
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp345
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h12
-rw-r--r--lib/Target/Hexagon/HexagonISelLoweringHVX.cpp134
-rw-r--r--lib/Target/Hexagon/HexagonPatterns.td9
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.td5
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h32
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp7
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.h2
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.cpp7
-rw-r--r--lib/Target/Lanai/LanaiTargetMachine.h2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp2
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td2
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp21
-rw-r--r--lib/Target/Mips/MipsTargetMachine.h2
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.h2
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp6
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp11
-rw-r--r--lib/Target/PowerPC/PPCMIPeephole.cpp18
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp7
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.h2
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.h2
-rw-r--r--lib/Target/TargetMachine.cpp13
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegStackify.cpp10
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp8
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp7
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.h3
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp2
-rw-r--r--lib/Target/X86/X86.td17
-rw-r--r--lib/Target/X86/X86DomainReassignment.cpp167
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp9
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp465
-rw-r--r--lib/Target/X86/X86Instr3DNow.td20
-rw-r--r--lib/Target/X86/X86InstrFormats.td8
-rw-r--r--lib/Target/X86/X86InstrInfo.td3
-rw-r--r--lib/Target/X86/X86InstrSSE.td2
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp7
-rw-r--r--lib/Target/X86/X86Subtarget.cpp24
-rw-r--r--lib/Target/X86/X86Subtarget.h29
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp7
-rw-r--r--lib/Target/X86/X86TargetMachine.h2
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp7
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h2
-rw-r--r--lib/Transforms/IPO/PartialInlining.cpp9
-rw-r--r--lib/Transforms/IPO/SampleProfile.cpp5
-rw-r--r--lib/Transforms/IPO/WholeProgramDevirt.cpp1
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp1
-rw-r--r--lib/Transforms/Instrumentation/HWAddressSanitizer.cpp25
-rw-r--r--lib/Transforms/Scalar/CallSiteSplitting.cpp46
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp19
-rw-r--r--lib/Transforms/Scalar/LoopSink.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp2
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp56
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp29
-rw-r--r--lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp2
-rw-r--r--lib/Transforms/Utils/CallPromotionUtils.cpp255
-rw-r--r--lib/Transforms/Utils/LoopUnrollPeel.cpp2
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp203
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp4
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp279
-rw-r--r--test/Analysis/BasicAA/args-rets-allocas-loads.ll7
-rw-r--r--test/Analysis/BasicAA/call-attrs.ll6
-rw-r--r--test/Analysis/BasicAA/cs-cs-arm.ll6
-rw-r--r--test/Analysis/BasicAA/cs-cs.ll62
-rw-r--r--test/Analysis/MemorySSA/volatile-clobber.ll13
-rw-r--r--test/Analysis/ValueTracking/memory-dereferenceable.ll22
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll2
-rw-r--r--test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir44
-rw-r--r--test/CodeGen/AArch64/GlobalISel/translate-gep.ll109
-rw-r--r--test/CodeGen/AArch64/arm64-jumptable.ll16
-rw-r--r--test/CodeGen/AArch64/arm64-memset-to-bzero.ll49
-rw-r--r--test/CodeGen/AArch64/arm64-neon-2velem.ll685
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll9
-rw-r--r--test/CodeGen/AArch64/chkstk.ll25
-rw-r--r--test/CodeGen/AArch64/ldst-paired-aliasing.ll5
-rw-r--r--test/CodeGen/AMDGPU/amdgpu.private-memory.ll3
-rw-r--r--test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll32
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir100
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-legalizer.mir52
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir66
-rw-r--r--test/CodeGen/ARM/avoid-cpsr-rmw.ll4
-rw-r--r--test/CodeGen/ARM/su-addsub-overflow.ll135
-rw-r--r--test/CodeGen/ARM/usat.ll214
-rw-r--r--test/CodeGen/BPF/objdump_imm_hex.ll65
-rw-r--r--test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll19
-rw-r--r--test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll18
-rw-r--r--test/CodeGen/Hexagon/autohvx/isel-select-const.ll32
-rw-r--r--test/CodeGen/Hexagon/expand-vstorerw-undef.ll2
-rw-r--r--test/CodeGen/Hexagon/v60-cur.ll5
-rw-r--r--test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll14
-rw-r--r--test/CodeGen/Hexagon/vect/vect-infloop.ll10
-rw-r--r--test/CodeGen/Mips/llvm-ir/extractelement.ll2
-rw-r--r--test/CodeGen/Mips/long-call-mcount.ll19
-rw-r--r--test/CodeGen/Mips/sll-micromips-r6-encoding.mir46
-rw-r--r--test/CodeGen/PowerPC/cmp_elimination.ll32
-rw-r--r--test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll15
-rw-r--r--test/CodeGen/PowerPC/variable_elem_vec_extracts.ll6
-rw-r--r--test/CodeGen/Thumb2/t2sizereduction.mir83
-rw-r--r--test/CodeGen/X86/avg-mask.ll24
-rw-r--r--test/CodeGen/X86/avg.ll2949
-rw-r--r--test/CodeGen/X86/avx512-calling-conv.ll88
-rw-r--r--test/CodeGen/X86/avx512-ext.ll21
-rw-r--r--test/CodeGen/X86/avx512-extract-subvector-load-store.ll136
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll163
-rw-r--r--test/CodeGen/X86/avx512-insert-extract_i1.ll3
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll22
-rwxr-xr-xtest/CodeGen/X86/avx512-schedule.ll12
-rw-r--r--test/CodeGen/X86/avx512-shuffles/partial_permute.ll299
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll7
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll5
-rw-r--r--test/CodeGen/X86/avx512vl-vec-masked-cmp.ll186
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-128.ll24
-rw-r--r--test/CodeGen/X86/bitcast-setcc-128.ll12
-rw-r--r--test/CodeGen/X86/combine-and.ll10
-rw-r--r--test/CodeGen/X86/combine-or.ll20
-rw-r--r--test/CodeGen/X86/darwin-bzero.ll5
-rw-r--r--test/CodeGen/X86/extractelement-index.ll12
-rw-r--r--test/CodeGen/X86/fma-fneg-combine.ll166
-rw-r--r--test/CodeGen/X86/fmsubadd-combine.ll96
-rw-r--r--test/CodeGen/X86/fold-vector-sext-crash.ll6
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smax.ll284
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smin.ll284
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umax.ll284
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umin.ll170
-rw-r--r--test/CodeGen/X86/known-bits-vector.ll14
-rw-r--r--test/CodeGen/X86/machinesink-merge-debuginfo.ll60
-rw-r--r--test/CodeGen/X86/machinesink-null-debuginfo.ll10
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll121
-rw-r--r--test/CodeGen/X86/popcnt.ll2
-rw-r--r--test/CodeGen/X86/prefetch.ll102
-rw-r--r--test/CodeGen/X86/shuffle-strided-with-offset-128.ll81
-rw-r--r--test/CodeGen/X86/shuffle-strided-with-offset-256.ll149
-rw-r--r--test/CodeGen/X86/shuffle-strided-with-offset-512.ll48
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-128.ll55
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-256.ll184
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-512.ll5
-rw-r--r--test/CodeGen/X86/var-permute-256.ll270
-rw-r--r--test/CodeGen/X86/var-permute-512.ll208
-rw-r--r--test/CodeGen/X86/vector-compare-results.ll56
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll511
-rw-r--r--test/CodeGen/X86/vector-rotate-128.ll76
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll154
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll154
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll154
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v16.ll113
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v8.ll397
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll849
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v32.ll946
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll237
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll264
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v32.ll18
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll185
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-128.ll596
-rw-r--r--test/CodeGen/X86/vector-trunc.ll62
-rw-r--r--test/CodeGen/X86/vector-zext.ll30
-rw-r--r--test/CodeGen/X86/vselect.ll6
-rw-r--r--test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s173
-rw-r--r--test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s4
-rw-r--r--test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s5
-rw-r--r--test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s94
-rw-r--r--test/DebugInfo/X86/dwarfdump-str-offsets-macho.s34
-rw-r--r--test/DebugInfo/X86/dwarfdump-str-offsets.s66
-rw-r--r--test/Instrumentation/HWAddressSanitizer/basic.ll72
-rw-r--r--test/Instrumentation/HWAddressSanitizer/with-calls.ll39
-rw-r--r--test/MC/AArch64/arm64-system-encoding.s3
-rw-r--r--test/MC/AArch64/basic-a64-diagnostics.s4
-rw-r--r--test/MC/AArch64/dot-req.s10
-rw-r--r--test/MC/AMDGPU/ds.s4
-rw-r--r--test/MC/AMDGPU/expressions.s8
-rw-r--r--test/MC/AMDGPU/invalid-instructions-spellcheck.s48
-rw-r--r--test/MC/AMDGPU/trap.s42
-rw-r--r--test/MC/AMDGPU/vop1-gfx9-err.s12
-rw-r--r--test/MC/AMDGPU/vop3p-err.s6
-rw-r--r--test/MC/ARM/dfb-neg.s10
-rw-r--r--test/MC/ARM/dfb.s6
-rw-r--r--test/MC/COFF/align-nops.s2
-rw-r--r--test/MC/Disassembler/AArch64/basic-a64-instructions.txt3
-rw-r--r--test/MC/Disassembler/AMDGPU/ds_vi.txt3
-rw-r--r--test/MC/Disassembler/AMDGPU/trap_gfx9.txt32
-rw-r--r--test/MC/Disassembler/AMDGPU/trap_vi.txt16
-rw-r--r--test/MC/Disassembler/ARM/dfb-arm.txt6
-rw-r--r--test/MC/Disassembler/ARM/dfb-thumb.txt6
-rw-r--r--test/MC/Disassembler/X86/x86-32.txt3
-rw-r--r--test/MC/ELF/align-nops.s2
-rw-r--r--test/MC/MachO/x86_32-optimal_nop.s2
-rw-r--r--test/MC/Mips/eva/invalid.s47
-rw-r--r--test/MC/WebAssembly/weak-alias.ll169
-rw-r--r--test/MC/X86/3DNow.s2
-rw-r--r--test/MC/X86/AlignedBundling/different-sections.s4
-rw-r--r--test/MC/X86/AlignedBundling/long-nop-pad.s4
-rw-r--r--test/MC/X86/AlignedBundling/misaligned-bundle-group.s4
-rw-r--r--test/MC/X86/AlignedBundling/misaligned-bundle.s4
-rw-r--r--test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s4
-rw-r--r--test/MC/X86/AlignedBundling/pad-bundle-groups.s4
-rw-r--r--test/MC/X86/AlignedBundling/relax-in-bundle-group.s4
-rw-r--r--test/MC/X86/AlignedBundling/single-inst-bundling.s4
-rw-r--r--test/MC/X86/CLFLUSHOPT-32.s26
-rw-r--r--test/MC/X86/CLFLUSHOPT-64.s26
-rw-r--r--test/MC/X86/CLFSH-32.s26
-rw-r--r--test/MC/X86/CLFSH-64.s26
-rw-r--r--test/MC/X86/x86_long_nop.s8
-rw-r--r--test/TableGen/GlobalISelEmitter.td53
-rw-r--r--test/TableGen/intrinsic-long-name.td2
-rw-r--r--test/TableGen/intrinsic-struct.td2
-rw-r--r--test/TableGen/intrinsic-varargs.td2
-rw-r--r--test/ThinLTO/X86/cache.ll22
-rw-r--r--test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll139
-rw-r--r--test/Transforms/CallSiteSplitting/callsite-no-splitting.ll18
-rw-r--r--test/Transforms/CodeGenPrepare/section.ll47
-rw-r--r--test/Transforms/GVN/tbaa.ll61
-rw-r--r--test/Transforms/Inline/AArch64/binop.ll291
-rw-r--r--test/Transforms/Inline/ARM/inline-fp.ll113
-rw-r--r--test/Transforms/Inline/inline-fp.ll137
-rw-r--r--test/Transforms/Inline/redundant-loads.ll18
-rw-r--r--test/Transforms/InstCombine/2011-09-03-Trampoline.ll23
-rw-r--r--test/Transforms/JumpThreading/guards.ll103
-rw-r--r--test/Transforms/LoopVectorize/legal_preheader_check.ll27
-rw-r--r--test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll48
-rw-r--r--test/Transforms/MemCpyOpt/merge-into-memset.ll45
-rw-r--r--test/Transforms/MemCpyOpt/mixed-sizes.ll36
-rw-r--r--test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll114
-rw-r--r--test/Transforms/NewGVN/tbaa.ll61
-rw-r--r--test/Transforms/PGOProfile/icp_covariant_call_return.ll3
-rw-r--r--test/Transforms/PGOProfile/icp_covariant_invoke_return.ll9
-rw-r--r--test/Transforms/PGOProfile/icp_invoke.ll17
-rw-r--r--test/Transforms/PGOProfile/icp_invoke_nouse.ll3
-rw-r--r--test/Transforms/PGOProfile/icp_vararg.ll3
-rw-r--r--test/Transforms/PGOProfile/indirect_call_promotion.ll3
-rw-r--r--test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll24
-rw-r--r--test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll125
-rw-r--r--test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll225
-rw-r--r--test/Transforms/SLPVectorizer/X86/jumbled-load.ll37
-rw-r--r--test/Transforms/SLPVectorizer/X86/store-jumbled.ll25
-rw-r--r--test/Transforms/SampleProfile/entry_counts.ll4
-rw-r--r--test/Transforms/SimplifyCFG/X86/if-conversion.ll231
-rw-r--r--test/tools/llvm-cvtres/machine.test56
-rw-r--r--test/tools/llvm-cvtres/symbols.test12
-rw-r--r--test/tools/llvm-dwarfdump/X86/lookup.s7
-rw-r--r--test/tools/llvm-objcopy/add-section-remove.test36
-rw-r--r--test/tools/llvm-objcopy/add-section.test37
-rw-r--r--test/tools/llvm-readobj/mips-got.test136
-rw-r--r--test/tools/llvm-readobj/mips-plt.test32
-rw-r--r--tools/llvm-dwarfdump/llvm-dwarfdump.cpp3
-rw-r--r--tools/llvm-lto/llvm-lto.cpp4
-rw-r--r--tools/llvm-objcopy/Object.cpp12
-rw-r--r--tools/llvm-objcopy/Object.h15
-rw-r--r--tools/llvm-objcopy/llvm-objcopy.cpp22
-rw-r--r--tools/llvm-readobj/ELFDumper.cpp706
-rw-r--r--tools/llvm-readobj/WasmDumper.cpp10
-rw-r--r--tools/opt-viewer/optrecord.py2
-rw-r--r--unittests/ADT/APFloatTest.cpp17
-rw-r--r--unittests/ADT/StringRefTest.cpp7
-rw-r--r--unittests/CodeGen/MachineOperandTest.cpp63
-rw-r--r--unittests/ExecutionEngine/Orc/CMakeLists.txt7
-rw-r--r--unittests/Support/CachePruningTest.cpp4
-rw-r--r--unittests/Support/MemoryBufferTest.cpp40
-rw-r--r--unittests/Support/TargetParserTest.cpp6
-rw-r--r--unittests/Support/YAMLIOTest.cpp107
-rw-r--r--utils/TableGen/CMakeLists.txt1
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.cpp94
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.h13
-rw-r--r--utils/TableGen/CodeGenIntrinsics.h8
-rw-r--r--utils/TableGen/CodeGenTarget.cpp7
-rw-r--r--utils/TableGen/CodeGenTarget.h20
-rw-r--r--utils/TableGen/GlobalISelEmitter.cpp115
-rw-r--r--utils/TableGen/IntrinsicEmitter.cpp1
-rw-r--r--utils/TableGen/SDNodeProperties.cpp49
-rw-r--r--utils/TableGen/SDNodeProperties.h40
-rwxr-xr-xutils/docker/build_docker_image.sh6
-rwxr-xr-xutils/docker/scripts/build_install_llvm.sh39
-rwxr-xr-xutils/git-svn/git-llvm11
-rwxr-xr-xutils/update_mir_test_checks.py63
423 files changed, 14374 insertions, 10340 deletions
diff --git a/docs/Extensions.rst b/docs/Extensions.rst
index 14fea30204b4..32eeadd78ba6 100644
--- a/docs/Extensions.rst
+++ b/docs/Extensions.rst
@@ -288,3 +288,31 @@ standard stack probe emission.
The MSVC environment does not emit code for VLAs currently.
+Windows on ARM64
+----------------
+
+Stack Probe Emission
+^^^^^^^^^^^^^^^^^^^^
+
+The reference implementation (Microsoft Visual Studio 2017) emits stack probes
+in the following fashion:
+
+.. code-block:: gas
+
+ mov x15, #constant
+ bl __chkstk
+ sub sp, sp, x15, lsl #4
+
+However, this has a range limitation of 256 MiB (±128 MiB). In order to
+accommodate larger binaries, LLVM supports the use of ``-mcode-model=large``
+to allow an 8 GiB (±4 GiB) range via a slight deviation. It will generate an
+indirect jump as follows:
+
+.. code-block:: gas
+
+ mov x15, #constant
+ adrp x16, __chkstk
+ add x16, x16, :lo12:__chkstk
+ blr x16
+ sub sp, sp, x15, lsl #4
+
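
The two ranges quoted above follow from the AArch64 immediate encodings. A quick sanity check of the figures (a sketch, assuming the standard widths of the bl and adrp immediates; not part of the patch):

    // bl: 26-bit signed word offset; adrp: 21-bit signed 4 KiB page offset.
    constexpr long long BLRange   = (1LL << 25) * 4;    // +/-134217728 bytes = +/-128 MiB
    constexpr long long AdrpRange = (1LL << 20) * 4096; // +/-4294967296 bytes = +/-4 GiB
    static_assert(BLRange == 128LL * 1024 * 1024, "bl reaches +/-128 MiB");
    static_assert(AdrpRange == 4LL * 1024 * 1024 * 1024, "adrp reaches +/-4 GiB");
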
diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst
index f170c7210879..1176435c8761 100644
--- a/docs/MIRLangRef.rst
+++ b/docs/MIRLangRef.rst
@@ -692,6 +692,50 @@ The syntax is:
EH_LABEL <mcsymbol Ltmp1>
+CFIIndex Operands
+^^^^^^^^^^^^^^^^^
+
+A CFI Index operand holds an index into a per-function side-table,
+``MachineFunction::getFrameInstructions()``, which references all the frame
+instructions in a ``MachineFunction``. A ``CFI_INSTRUCTION`` may look like it
+contains multiple operands, but the only operand it contains is the CFI Index.
+The other operands are tracked by the ``MCCFIInstruction`` object.
+
+The syntax is:
+
+.. code-block:: text
+
+ CFI_INSTRUCTION offset %w30, -16
+
+which may be emitted later in the MC layer as:
+
+.. code-block:: text
+
+ .cfi_offset w30, -16
+
+IntrinsicID Operands
+^^^^^^^^^^^^^^^^^^^^
+
+An Intrinsic ID operand contains a generic intrinsic ID or a target-specific ID.
+
+The syntax for the ``returnaddress`` intrinsic is:
+
+.. code-block:: text
+
+ %x0 = COPY intrinsic(@llvm.returnaddress)
+
+Predicate Operands
+^^^^^^^^^^^^^^^^^^
+
+A Predicate operand contains an IR predicate from ``CmpInst::Predicate``, such
+as ``ICMP_EQ``.
+
+For the integer equality predicate ``ICMP_EQ``, the syntax is:
+
+.. code-block:: text
+
+ %2:gpr(s32) = G_ICMP intpred(eq), %0, %1
+
.. TODO: Describe the parsers default behaviour when optional YAML attributes
are missing.
.. TODO: Describe the syntax for the bundled instructions.
@@ -702,7 +746,6 @@ The syntax is:
.. TODO: Describe the syntax of the stack object machine operands and their
YAML definitions.
.. TODO: Describe the syntax of the block address machine operands.
-.. TODO: Describe the syntax of the CFI index machine operands.
.. TODO: Describe the syntax of the metadata machine operands, and the
instructions debug location attribute.
.. TODO: Describe the syntax of the register live out machine operands.
diff --git a/docs/tutorial/LangImpl09.rst b/docs/tutorial/LangImpl09.rst
index fe5a95a5769e..d81f9fa0001c 100644
--- a/docs/tutorial/LangImpl09.rst
+++ b/docs/tutorial/LangImpl09.rst
@@ -197,7 +197,7 @@ expressions:
if (DblTy)
return DblTy;
- DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float);
+ DblTy = DBuilder->createBasicType("double", 64, dwarf::DW_ATE_float);
return DblTy;
}
@@ -208,7 +208,8 @@ And then later on in ``main`` when we're constructing our module:
DBuilder = new DIBuilder(*TheModule);
KSDbgInfo.TheCU = DBuilder->createCompileUnit(
- dwarf::DW_LANG_C, "fib.ks", ".", "Kaleidoscope Compiler", 0, "", 0);
+ dwarf::DW_LANG_C, DBuilder->createFile("fib.ks", "."),
+ "Kaleidoscope Compiler", 0, "", 0);
There are a couple of things to note here. First, while we're producing a
compile unit for a language called Kaleidoscope we used the language
diff --git a/examples/Kaleidoscope/CMakeLists.txt b/examples/Kaleidoscope/CMakeLists.txt
index 543b9f73b4fe..3822cdd9e1c4 100644
--- a/examples/Kaleidoscope/CMakeLists.txt
+++ b/examples/Kaleidoscope/CMakeLists.txt
@@ -14,3 +14,4 @@ add_subdirectory(Chapter5)
add_subdirectory(Chapter6)
add_subdirectory(Chapter7)
add_subdirectory(Chapter8)
+add_subdirectory(Chapter9)
diff --git a/examples/Kaleidoscope/Chapter9/toy.cpp b/examples/Kaleidoscope/Chapter9/toy.cpp
index 1b13e45ec460..821cf4d25a65 100644
--- a/examples/Kaleidoscope/Chapter9/toy.cpp
+++ b/examples/Kaleidoscope/Chapter9/toy.cpp
@@ -823,7 +823,7 @@ DIType *DebugInfo::getDoubleTy() {
if (DblTy)
return DblTy;
- DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float);
+ DblTy = DBuilder->createBasicType("double", 64, dwarf::DW_ATE_float);
return DblTy;
}
@@ -1436,7 +1436,8 @@ int main() {
// Currently down as "fib.ks" as a filename since we're redirecting stdin
// but we'd like actual source locations.
KSDbgInfo.TheCU = DBuilder->createCompileUnit(
- dwarf::DW_LANG_C, "fib.ks", ".", "Kaleidoscope Compiler", 0, "", 0);
+ dwarf::DW_LANG_C, DBuilder->createFile("fib.ks", "."),
+ "Kaleidoscope Compiler", 0, "", 0);
// Run the main "interpreter loop" now.
MainLoop();
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index 8d45b7832041..55f3e46c45ed 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -757,17 +757,17 @@ extern void thinlto_codegen_add_cross_referenced_symbol(thinlto_code_gen_t cg,
* @ingroup LLVMCTLTO
*
* These entry points control the ThinLTO cache. The cache is intended to
- * support incremental build, and thus needs to be persistent accross build.
- * The client enabled the cache by supplying a path to an existing directory.
+ * support incremental builds, and thus needs to be persistent across builds.
+ * The client enables the cache by supplying a path to an existing directory.
* The code generator will use this to store objects files that may be reused
* during a subsequent build.
* To avoid filling the disk space, a few knobs are provided:
- * - The pruning interval limit the frequency at which the garbage collector
- * will try to scan the cache directory to prune it from expired entries.
- * Setting to -1 disable the pruning (default).
+ * - The pruning interval limits the frequency at which the garbage collector
+ * will try to scan the cache directory to prune expired entries.
+ * Setting to a negative number disables the pruning.
* - The pruning expiration time indicates to the garbage collector how old an
* entry needs to be to be removed.
- * - Finally, the garbage collector can be instructed to prune the cache till
+ * - Finally, the garbage collector can be instructed to prune the cache until
* the occupied space goes below a threshold.
* @{
*/
@@ -782,7 +782,7 @@ extern void thinlto_codegen_set_cache_dir(thinlto_code_gen_t cg,
const char *cache_dir);
/**
- * Sets the cache pruning interval (in seconds). A negative value disable the
+ * Sets the cache pruning interval (in seconds). A negative value disables the
* pruning. An unspecified default value will be applied, and a value of 0 will
* be ignored.
*
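
The knobs described in this comment correspond to the setter entry points declared further down in the header. A minimal sketch of wiring a cache up (the directory path and the specific values below are only illustrative):

    #include "llvm-c/lto.h"

    void configureThinLTOCache(thinlto_code_gen_t CG) {
      // Reuse objects across builds from an existing directory.
      thinlto_codegen_set_cache_dir(CG, "/path/to/thinlto.cache");
      // Let the garbage collector scan at most once every 20 minutes;
      // a negative value disables pruning entirely.
      thinlto_codegen_set_cache_pruning_interval(CG, 1200);
      // Entries older than one week become eligible for pruning.
      thinlto_codegen_set_cache_entry_expiration(CG, 7 * 24 * 3600);
    }
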
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 9de075dfd681..362096b08e13 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -95,46 +95,81 @@ enum AliasResult {
///
/// This is no access at all, a modification, a reference, or both
/// a modification and a reference. These are specifically structured such that
-/// they form a two bit matrix and bit-tests for 'mod' or 'ref'
+/// they form a three bit matrix and bit-tests for 'mod' or 'ref' or 'must'
/// work with any of the possible values.
-
enum class ModRefInfo {
+ /// Must is provided for completeness, but no routines will return only
+ /// Must today. See definition of Must below.
+ Must = 0,
+ /// The access may reference the value stored in memory,
+ /// a mustAlias relation was found, and no mayAlias or partialAlias found.
+ MustRef = 1,
+ /// The access may modify the value stored in memory,
+ /// a mustAlias relation was found, and no mayAlias or partialAlias found.
+ MustMod = 2,
+ /// The access may reference and/or modify the value stored in memory,
+ /// a mustAlias relation was found, and no mayAlias or partialAlias found.
+ MustModRef = MustRef | MustMod,
/// The access neither references nor modifies the value stored in memory.
- NoModRef = 0,
+ NoModRef = 4,
/// The access may reference the value stored in memory.
- Ref = 1,
+ Ref = NoModRef | MustRef,
/// The access may modify the value stored in memory.
- Mod = 2,
+ Mod = NoModRef | MustMod,
/// The access may reference and may modify the value stored in memory.
ModRef = Ref | Mod,
+
+ /// About Must:
+ /// Must is set in a best effort manner.
+ /// We usually do not try our best to infer Must; instead, it is merely
+ /// another piece of "free" information that is presented when available.
+ /// Must being set means a MustAlias was certainly found. For calls,
+ /// where multiple arguments are checked (argmemonly), this means that
+ /// only MustAlias or NoAlias results were found.
+ /// Must is not set for RAR accesses, even if the two locations must
+ /// alias. The reason is that two read accesses translate to an early return
+ /// of NoModRef. An additional alias check to set Must may be
+ /// expensive. Other cases may also not set Must (e.g. callCapturesBefore).
+ /// We refer to Must being *set* when the most significant bit is *cleared*.
+ /// Conversely we *clear* Must information by *setting* the Must bit to 1.
};
LLVM_NODISCARD inline bool isNoModRef(const ModRefInfo MRI) {
- return MRI == ModRefInfo::NoModRef;
+ return (static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustModRef)) ==
+ static_cast<int>(ModRefInfo::Must);
}
LLVM_NODISCARD inline bool isModOrRefSet(const ModRefInfo MRI) {
- return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::ModRef);
+ return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustModRef);
}
LLVM_NODISCARD inline bool isModAndRefSet(const ModRefInfo MRI) {
- return (static_cast<int>(MRI) & static_cast<int>(ModRefInfo::ModRef)) ==
- static_cast<int>(ModRefInfo::ModRef);
+ return (static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustModRef)) ==
+ static_cast<int>(ModRefInfo::MustModRef);
}
LLVM_NODISCARD inline bool isModSet(const ModRefInfo MRI) {
- return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Mod);
+ return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustMod);
}
LLVM_NODISCARD inline bool isRefSet(const ModRefInfo MRI) {
- return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Ref);
+ return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustRef);
+}
+LLVM_NODISCARD inline bool isMustSet(const ModRefInfo MRI) {
+ return !(static_cast<int>(MRI) & static_cast<int>(ModRefInfo::NoModRef));
}
LLVM_NODISCARD inline ModRefInfo setMod(const ModRefInfo MRI) {
- return ModRefInfo(static_cast<int>(MRI) | static_cast<int>(ModRefInfo::Mod));
+ return ModRefInfo(static_cast<int>(MRI) |
+ static_cast<int>(ModRefInfo::MustMod));
}
LLVM_NODISCARD inline ModRefInfo setRef(const ModRefInfo MRI) {
- return ModRefInfo(static_cast<int>(MRI) | static_cast<int>(ModRefInfo::Ref));
+ return ModRefInfo(static_cast<int>(MRI) |
+ static_cast<int>(ModRefInfo::MustRef));
+}
+LLVM_NODISCARD inline ModRefInfo setMust(const ModRefInfo MRI) {
+ return ModRefInfo(static_cast<int>(MRI) &
+ static_cast<int>(ModRefInfo::MustModRef));
}
LLVM_NODISCARD inline ModRefInfo setModAndRef(const ModRefInfo MRI) {
return ModRefInfo(static_cast<int>(MRI) |
- static_cast<int>(ModRefInfo::ModRef));
+ static_cast<int>(ModRefInfo::MustModRef));
}
LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) {
return ModRefInfo(static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Ref));
@@ -142,6 +177,10 @@ LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) {
LLVM_NODISCARD inline ModRefInfo clearRef(const ModRefInfo MRI) {
return ModRefInfo(static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Mod));
}
+LLVM_NODISCARD inline ModRefInfo clearMust(const ModRefInfo MRI) {
+ return ModRefInfo(static_cast<int>(MRI) |
+ static_cast<int>(ModRefInfo::NoModRef));
+}
LLVM_NODISCARD inline ModRefInfo unionModRef(const ModRefInfo MRI1,
const ModRefInfo MRI2) {
return ModRefInfo(static_cast<int>(MRI1) | static_cast<int>(MRI2));
@@ -160,11 +199,11 @@ enum FunctionModRefLocation {
/// Base case is no access to memory.
FMRL_Nowhere = 0,
/// Access to memory via argument pointers.
- FMRL_ArgumentPointees = 4,
+ FMRL_ArgumentPointees = 8,
/// Memory that is inaccessible via LLVM IR.
- FMRL_InaccessibleMem = 8,
+ FMRL_InaccessibleMem = 16,
/// Access to any memory.
- FMRL_Anywhere = 16 | FMRL_InaccessibleMem | FMRL_ArgumentPointees
+ FMRL_Anywhere = 32 | FMRL_InaccessibleMem | FMRL_ArgumentPointees
};
/// Summary of how a function affects memory in the program.
@@ -344,7 +383,7 @@ public:
/// result's bits are set to indicate the allowed aliasing ModRef kinds. Note
/// that these bits do not necessarily account for the overall behavior of
/// the function, but rather only provide additional per-argument
- /// information.
+ /// information. This never sets ModRefInfo::Must.
ModRefInfo getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx);
/// Return the behavior of the given call site.
@@ -624,6 +663,8 @@ public:
/// or reads the specified memory location \p MemLoc before instruction \p I
/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
/// instruction ordering queries inside the BasicBlock containing \p I.
+ /// Early exits in callCapturesBefore may lead to ModRefInfo::Must not being
+ /// set.
ModRefInfo callCapturesBefore(const Instruction *I,
const MemoryLocation &MemLoc, DominatorTree *DT,
OrderedBasicBlock *OBB = nullptr);
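
A short sketch of how the encoding above composes with the helper predicates, using only the definitions from this header:

    #include "llvm/Analysis/AliasAnalysis.h"
    #include <cassert>
    using namespace llvm;

    void modRefInfoExample() {
      ModRefInfo MRI = ModRefInfo::MustMod;    // a 'mod' access with a known MustAlias
      assert(isModSet(MRI) && !isRefSet(MRI) && isMustSet(MRI));

      MRI = clearMust(MRI);                    // setting the NoModRef bit drops the Must info
      assert(MRI == ModRefInfo::Mod && !isMustSet(MRI));

      MRI = unionModRef(MRI, ModRefInfo::Ref); // bitwise-or keeps the conservative result
      assert(isModAndRefSet(MRI) && MRI == ModRefInfo::ModRef);
    }
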
diff --git a/include/llvm/Analysis/AliasAnalysisEvaluator.h b/include/llvm/Analysis/AliasAnalysisEvaluator.h
index 214574852655..cd2f631a01f4 100644
--- a/include/llvm/Analysis/AliasAnalysisEvaluator.h
+++ b/include/llvm/Analysis/AliasAnalysisEvaluator.h
@@ -35,19 +35,23 @@ class AAEvaluator : public PassInfoMixin<AAEvaluator> {
int64_t FunctionCount;
int64_t NoAliasCount, MayAliasCount, PartialAliasCount, MustAliasCount;
int64_t NoModRefCount, ModCount, RefCount, ModRefCount;
+ int64_t MustCount, MustRefCount, MustModCount, MustModRefCount;
public:
AAEvaluator()
: FunctionCount(), NoAliasCount(), MayAliasCount(), PartialAliasCount(),
MustAliasCount(), NoModRefCount(), ModCount(), RefCount(),
- ModRefCount() {}
+ ModRefCount(), MustCount(), MustRefCount(), MustModCount(),
+ MustModRefCount() {}
AAEvaluator(AAEvaluator &&Arg)
: FunctionCount(Arg.FunctionCount), NoAliasCount(Arg.NoAliasCount),
MayAliasCount(Arg.MayAliasCount),
PartialAliasCount(Arg.PartialAliasCount),
MustAliasCount(Arg.MustAliasCount), NoModRefCount(Arg.NoModRefCount),
ModCount(Arg.ModCount), RefCount(Arg.RefCount),
- ModRefCount(Arg.ModRefCount) {
+ ModRefCount(Arg.ModRefCount), MustCount(Arg.MustCount),
+ MustRefCount(Arg.MustRefCount), MustModCount(Arg.MustModCount),
+ MustModRefCount(Arg.MustModRefCount) {
Arg.FunctionCount = 0;
}
~AAEvaluator();
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 54f151ef82e2..28154c873b70 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -667,21 +667,6 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap = ValueToValueMap(),
bool Assume = false, bool ShouldCheckWrap = true);
-/// \brief Attempt to sort the 'loads' in \p VL and return the sorted values in
-/// \p Sorted.
-///
-/// Returns 'false' if sorting is not legal or feasible, otherwise returns
-/// 'true'. If \p Mask is not null, it also returns the \p Mask which is the
-/// shuffle mask for actual memory access order.
-///
-/// For example, for a given VL of memory accesses in program order, a[i+2],
-/// a[i+0], a[i+1] and a[i+3], this function will sort the VL and save the
-/// sorted value in 'Sorted' as a[i+0], a[i+1], a[i+2], a[i+3] and saves the
-/// mask for actual memory accesses in program order in 'Mask' as <2,0,1,3>
-bool sortLoadAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
- ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted,
- SmallVectorImpl<unsigned> *Mask = nullptr);
-
/// \brief Returns true if the memory operations \p A and \p B are consecutive.
/// This is a simple API that does not depend on the analysis pass.
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index c2974525a6ff..391a333594e9 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -407,6 +407,12 @@ public:
void getNonLocalPointerDependency(Instruction *QueryInst,
SmallVectorImpl<NonLocalDepResult> &Result);
+ /// Perform a dependency query specifically for QueryInst's access to Loc.
+ /// The other comments for getNonLocalPointerDependency apply here as well.
+ void getNonLocalPointerDependencyFrom(Instruction *QueryInst,
+ const MemoryLocation &Loc, bool isLoad,
+ SmallVectorImpl<NonLocalDepResult> &Result);
+
/// Removes an instruction from the dependence analysis, updating the
/// dependence of instructions that previously depended on it.
void removeInstruction(Instruction *InstToRemove);
diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h
index bd7b00374821..293033458429 100644
--- a/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -92,12 +92,12 @@ public:
bool hasHugeWorkingSetSize();
/// \brief Returns true if \p F has hot function entry.
bool isFunctionEntryHot(const Function *F);
- /// Returns true if \p F has hot function entry or hot call edge.
- bool isFunctionHotInCallGraph(const Function *F);
+ /// Returns true if \p F contains hot code.
+ bool isFunctionHotInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
/// \brief Returns true if \p F has cold function entry.
bool isFunctionEntryCold(const Function *F);
- /// Returns true if \p F has cold function entry or cold call edge.
- bool isFunctionColdInCallGraph(const Function *F);
+ /// Returns true if \p F contains only cold code.
+ bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
/// \brief Returns true if \p F is a hot function.
bool isHotCount(uint64_t C);
/// \brief Returns true if count \p C is considered cold.
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index 4578e0da8ab2..3df04e98bd24 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -47,7 +47,7 @@ namespace llvm {
ScalarEvolution &SE;
const DataLayout &DL;
- // New instructions receive a name to identifies them with the current pass.
+ // New instructions receive a name to identify them with the current pass.
const char* IVName;
// InsertedExpressions caches Values for reuse, so must track RAUW.
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index c20f20cfbe4d..cecd8958e9d9 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -646,6 +646,9 @@ public:
/// \brief Additional properties of an operand's values.
enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
+ /// \return True if the target can execute instructions out of order.
+ bool isOutOfOrder() const;
+
/// \return The number of scalar or vector registers that the target has.
/// If 'Vectors' is true, it returns the number of vector registers. If it is
/// set to false, it returns the number of scalar registers.
@@ -1018,6 +1021,7 @@ public:
Type *Ty) = 0;
virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty) = 0;
+ virtual bool isOutOfOrder() const = 0;
virtual unsigned getNumberOfRegisters(bool Vector) = 0;
virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1295,6 +1299,9 @@ public:
Type *Ty) override {
return Impl.getIntImmCost(IID, Idx, Imm, Ty);
}
+ bool isOutOfOrder() const override {
+ return Impl.isOutOfOrder();
+ }
unsigned getNumberOfRegisters(bool Vector) override {
return Impl.getNumberOfRegisters(Vector);
}
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4c37402278ef..3625675d53de 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -337,6 +337,8 @@ public:
return TTI::TCC_Free;
}
+ bool isOutOfOrder() const { return false; }
+
unsigned getNumberOfRegisters(bool Vector) { return 8; }
unsigned getRegisterBitWidth(bool Vector) const { return 32; }
diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h
index 506cd0393e9a..57a0b441821b 100644
--- a/include/llvm/BinaryFormat/Wasm.h
+++ b/include/llvm/BinaryFormat/Wasm.h
@@ -208,7 +208,7 @@ const unsigned WASM_SYMBOL_VISIBILITY_HIDDEN = 0x4;
#define WASM_RELOC(name, value) name = value,
enum : unsigned {
-#include "WasmRelocs/WebAssembly.def"
+#include "WasmRelocs.def"
};
#undef WASM_RELOC
diff --git a/include/llvm/BinaryFormat/WasmRelocs/WebAssembly.def b/include/llvm/BinaryFormat/WasmRelocs.def
index d6f0e42b33bf..d6f0e42b33bf 100644
--- a/include/llvm/BinaryFormat/WasmRelocs/WebAssembly.def
+++ b/include/llvm/BinaryFormat/WasmRelocs.def
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index bb5e7f9e8e30..f1f9275b0786 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -302,9 +302,13 @@ public:
}
unsigned getFPOpCost(Type *Ty) {
- // By default, FP instructions are no more expensive since they are
- // implemented in HW. Target specific TTI can override this.
- return TargetTransformInfo::TCC_Basic;
+ // Check whether FADD is available, as a proxy for floating-point in
+ // general.
+ const TargetLoweringBase *TLI = getTLI();
+ EVT VT = TLI->getValueType(DL, Ty);
+ if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
+ return TargetTransformInfo::TCC_Basic;
+ return TargetTransformInfo::TCC_Expensive;
}
unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
@@ -398,6 +402,10 @@ public:
return BaseT::getInstructionLatency(I);
}
+ bool isOutOfOrder() const {
+ return getST()->getSchedModel().isOutOfOrder();
+ }
+
/// @}
/// \name Vector TTI Implementations
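
The new isOutOfOrder() hook simply forwards to the subtarget's MCSchedModel. A hypothetical client (not taken from this patch) might use it to tune a heuristic:

    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    unsigned pickUnrollFactor(const TargetTransformInfo &TTI) {
      // In-order cores benefit more from exposing independent work per
      // iteration, so unroll them harder; an OoO core finds the ILP itself.
      return TTI.isOutOfOrder() ? 2 : 4;
    }
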
diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
index e599a1b179ec..4264a866b6c0 100644
--- a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -282,10 +282,6 @@ enum {
/// Provides the logic to select generic machine instructions.
class InstructionSelector {
public:
- using I64ImmediatePredicateFn = bool (*)(int64_t);
- using APIntImmediatePredicateFn = bool (*)(const APInt &);
- using APFloatImmediatePredicateFn = bool (*)(const APFloat &);
-
virtual ~InstructionSelector() = default;
/// Select the (possibly generic) instruction \p I to only use target-specific
@@ -319,9 +315,6 @@ public:
struct MatcherInfoTy {
const LLT *TypeObjects;
const PredicateBitset *FeatureBitsets;
- const I64ImmediatePredicateFn *I64ImmPredicateFns;
- const APIntImmediatePredicateFn *APIntImmPredicateFns;
- const APFloatImmediatePredicateFn *APFloatImmPredicateFns;
const ComplexMatcherMemFn *ComplexPredicates;
};
@@ -340,6 +333,16 @@ protected:
const RegisterBankInfo &RBI, const PredicateBitset &AvailableFeatures,
CodeGenCoverage &CoverageInfo) const;
+ virtual bool testImmPredicate_I64(unsigned, int64_t) const {
+ llvm_unreachable("Subclasses must override this to use tablegen");
+ }
+ virtual bool testImmPredicate_APInt(unsigned, const APInt &) const {
+ llvm_unreachable("Subclasses must override this to use tablegen");
+ }
+ virtual bool testImmPredicate_APFloat(unsigned, const APFloat &) const {
+ llvm_unreachable("Subclasses must override this to use tablegen");
+ }
+
/// Constrain a register operand of an instruction \p I to a specified
/// register class. This could involve inserting COPYs before (for uses) or
/// after (for defs) and may replace the operand of \p I.
diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
index ac2c055ab145..bf834cf8f5e3 100644
--- a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
+++ b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h
@@ -181,7 +181,7 @@ bool InstructionSelector::executeMatchTable(
else
llvm_unreachable("Expected Imm or CImm operand");
- if (!MatcherInfo.I64ImmPredicateFns[Predicate](Value))
+ if (!testImmPredicate_I64(Predicate, Value))
if (handleReject() == RejectAndGiveUp)
return false;
break;
@@ -202,7 +202,7 @@ bool InstructionSelector::executeMatchTable(
else
llvm_unreachable("Expected Imm or CImm operand");
- if (!MatcherInfo.APIntImmPredicateFns[Predicate](Value))
+ if (!testImmPredicate_APInt(Predicate, Value))
if (handleReject() == RejectAndGiveUp)
return false;
break;
@@ -221,7 +221,7 @@ bool InstructionSelector::executeMatchTable(
assert(Predicate > GIPFP_APFloat_Invalid && "Expected a valid predicate");
APFloat Value = State.MIs[InsnID]->getOperand(1).getFPImm()->getValueAPF();
- if (!MatcherInfo.APFloatImmPredicateFns[Predicate](Value))
+ if (!testImmPredicate_APFloat(Predicate, Value))
if (handleReject() == RejectAndGiveUp)
return false;
break;
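
With the function-pointer tables gone, TableGen emits testImmPredicate_* overrides on each target's selector that switch on the predicate ID. A rough sketch of what such an override boils down to, shown as a standalone function with the class scaffolding omitted (the simm8 predicate and its ID are hypothetical):

    #include "llvm/Support/ErrorHandling.h"
    #include <cstdint>

    // Hypothetical predicate IDs, following the GIPFP_I64_* numbering scheme.
    enum { GIPFP_I64_Invalid = 0, GIPFP_I64_Predicate_simm8 };

    // Body of a generated testImmPredicate_I64 override.
    bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) {
      switch (PredicateID) {
      case GIPFP_I64_Predicate_simm8: // would come from an ImmLeaf predicate in the .td file
        return Imm >= -128 && Imm < 128;
      }
      llvm_unreachable("Unknown predicate");
    }
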
diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStacks.h
index c90ae7b184f4..44ed785f7b53 100644
--- a/include/llvm/CodeGen/LiveStackAnalysis.h
+++ b/include/llvm/CodeGen/LiveStacks.h
@@ -1,4 +1,4 @@
-//===- LiveStackAnalysis.h - Live Stack Slot Analysis -----------*- C++ -*-===//
+//===- LiveStacks.h - Live Stack Slot Analysis ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CODEGEN_LIVESTACKANALYSIS_H
-#define LLVM_CODEGEN_LIVESTACKANALYSIS_H
+#ifndef LLVM_CODEGEN_LIVESTACKS_H
+#define LLVM_CODEGEN_LIVESTACKS_H
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -100,4 +100,4 @@ public:
} // end namespace llvm
-#endif // LLVM_CODEGEN_LIVESTACK_ANALYSIS_H
+#endif
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index ccf0917ed085..4be7942c2c64 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -29,6 +29,7 @@ class GlobalValue;
class MachineBasicBlock;
class MachineInstr;
class MachineRegisterInfo;
+class MCCFIInstruction;
class MDNode;
class ModuleSlotTracker;
class TargetMachine;
@@ -250,6 +251,12 @@ public:
static void printStackObjectReference(raw_ostream &OS, unsigned FrameIndex,
bool IsFixed, StringRef Name);
+ /// Print the offset with explicit +/- signs.
+ static void printOperandOffset(raw_ostream &OS, int64_t Offset);
+
+ /// Print an IRSlotNumber.
+ static void printIRSlotNumber(raw_ostream &OS, int Slot);
+
/// Print the MachineOperand to \p os.
/// Providing a valid \p TRI and \p IntrinsicInfo results in a more
/// target-specific printing. If \p TRI and \p IntrinsicInfo are null, the
diff --git a/include/llvm/CodeGen/RuntimeLibcalls.def b/include/llvm/CodeGen/RuntimeLibcalls.def
index e042ae982e86..7695e9d782ef 100644
--- a/include/llvm/CodeGen/RuntimeLibcalls.def
+++ b/include/llvm/CodeGen/RuntimeLibcalls.def
@@ -165,6 +165,8 @@ HANDLE_LIBCALL(SINCOS_F64, nullptr)
HANDLE_LIBCALL(SINCOS_F80, nullptr)
HANDLE_LIBCALL(SINCOS_F128, nullptr)
HANDLE_LIBCALL(SINCOS_PPCF128, nullptr)
+HANDLE_LIBCALL(SINCOS_STRET_F32, nullptr)
+HANDLE_LIBCALL(SINCOS_STRET_F64, nullptr)
HANDLE_LIBCALL(POW_F32, "powf")
HANDLE_LIBCALL(POW_F64, "pow")
HANDLE_LIBCALL(POW_F80, "powl")
@@ -334,6 +336,7 @@ HANDLE_LIBCALL(O_PPCF128, "__gcc_qunord")
HANDLE_LIBCALL(MEMCPY, "memcpy")
HANDLE_LIBCALL(MEMMOVE, "memmove")
HANDLE_LIBCALL(MEMSET, "memset")
+HANDLE_LIBCALL(BZERO, nullptr)
// Element-wise unordered-atomic memory of different sizes
HANDLE_LIBCALL(MEMCPY_ELEMENT_UNORDERED_ATOMIC_1, "__llvm_memcpy_element_unordered_atomic_1")
diff --git a/include/llvm/CodeGen/SDNodeProperties.td b/include/llvm/CodeGen/SDNodeProperties.td
new file mode 100644
index 000000000000..83bbab2fdc8d
--- /dev/null
+++ b/include/llvm/CodeGen/SDNodeProperties.td
@@ -0,0 +1,34 @@
+//===- SDNodeProperties.td - Common code for DAG isels ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class SDNodeProperty;
+
+// Selection DAG Pattern Operations
+class SDPatternOperator {
+ list<SDNodeProperty> Properties = [];
+}
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Node Properties.
+//
+// Note: These are hard coded into tblgen.
+//
+def SDNPCommutative : SDNodeProperty; // X op Y == Y op X
+def SDNPAssociative : SDNodeProperty; // (X op Y) op Z == X op (Y op Z)
+def SDNPHasChain : SDNodeProperty; // R/W chain operand and result
+def SDNPOutGlue : SDNodeProperty; // Write a flag result
+def SDNPInGlue : SDNodeProperty; // Read a flag operand
+def SDNPOptInGlue : SDNodeProperty; // Optionally read a flag operand
+def SDNPMayStore : SDNodeProperty; // May write to memory, sets 'mayStore'.
+def SDNPMayLoad : SDNodeProperty; // May read memory, sets 'mayLoad'.
+def SDNPSideEffect : SDNodeProperty; // Sets 'HasUnmodelledSideEffects'.
+def SDNPMemOperand : SDNodeProperty; // Touches memory, has assoc MemOperand
+def SDNPVariadic : SDNodeProperty; // Node has variable arguments.
+def SDNPWantRoot : SDNodeProperty; // ComplexPattern gets the root of match
+def SDNPWantParent : SDNodeProperty; // ComplexPattern gets the parent
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 7de2e766d521..522c2f1b2cb2 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -189,8 +189,8 @@ public:
inline bool isUndef() const;
inline unsigned getMachineOpcode() const;
inline const DebugLoc &getDebugLoc() const;
- inline void dump() const;
- inline void dumpr() const;
+ inline void dump(const SelectionDAG *G = nullptr) const;
+ inline void dumpr(const SelectionDAG *G = nullptr) const;
/// Return true if this operand (which must be a chain) reaches the
/// specified operand without crossing any side-effecting instructions.
@@ -1089,12 +1089,12 @@ inline const DebugLoc &SDValue::getDebugLoc() const {
return Node->getDebugLoc();
}
-inline void SDValue::dump() const {
- return Node->dump();
+inline void SDValue::dump(const SelectionDAG *G) const {
+ return Node->dump(G);
}
-inline void SDValue::dumpr() const {
- return Node->dumpr();
+inline void SDValue::dumpr(const SelectionDAG *G) const {
+ return Node->dumpr(G);
}
// Define inline functions from the SDUse class.
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index 0fa19d09e776..380e3b19dc80 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -824,8 +824,8 @@ public:
/// also combined within this function. Currently, the minimum size check is
/// performed in findJumpTable() in SelectionDAGBuiler and
/// getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
- bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
- uint64_t Range) const {
+ virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
+ uint64_t Range) const {
const bool OptForSize = SI->getParent()->getParent()->optForSize();
const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
const unsigned MaxJumpTableSize =
@@ -1276,7 +1276,7 @@ public:
}
/// Return lower limit for number of blocks in a jump table.
- unsigned getMinimumJumpTableEntries() const;
+ virtual unsigned getMinimumJumpTableEntries() const;
/// Return lower limit of the density in a jump table.
unsigned getMinimumJumpTableDensity(bool OptForSize) const;
@@ -2429,7 +2429,7 @@ private:
PromoteToType;
/// Stores the name each libcall.
- const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL];
+ const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
/// The ISD::CondCode that should be used to test the result of each of the
/// comparison libcall against zero.
@@ -2438,6 +2438,9 @@ private:
/// Stores the CallingConv that should be used for each libcall.
CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
+ /// Set default libcall names and calling conventions.
+ void InitLibcalls(const Triple &TT);
+
protected:
/// Return true if the extension represented by \p I is free.
/// \pre \p I is a sign, zero, or fp extension and
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index e9178e03fa8a..3cec58383f87 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -165,6 +165,29 @@ struct BaseAddress {
uint64_t SectionIndex;
};
+/// Represents a unit's contribution to the string offsets table.
+struct StrOffsetsContributionDescriptor {
+ uint64_t Base = 0;
+ uint64_t Size = 0;
+ /// Format and version.
+ DWARFFormParams FormParams = {0, 0, dwarf::DwarfFormat::DWARF32};
+
+ StrOffsetsContributionDescriptor(uint64_t Base, uint64_t Size,
+ uint8_t Version, dwarf::DwarfFormat Format)
+ : Base(Base), Size(Size), FormParams({Version, 0, Format}) {}
+
+ uint8_t getVersion() const { return FormParams.Version; }
+ dwarf::DwarfFormat getFormat() const { return FormParams.Format; }
+ uint8_t getDwarfOffsetByteSize() const {
+ return FormParams.getDwarfOffsetByteSize();
+ }
+ /// Determine whether a contribution to the string offsets table is
+ /// consistent with the relevant section size and whether its length is
+ /// a multiple of the size of one of its entries.
+ Optional<StrOffsetsContributionDescriptor>
+ validateContributionSize(DWARFDataExtractor &DA);
+};
+
class DWARFUnit {
DWARFContext &Context;
/// Section containing this DWARFUnit.
@@ -176,7 +199,6 @@ class DWARFUnit {
const DWARFSection &LineSection;
StringRef StringSection;
const DWARFSection &StringOffsetSection;
- uint64_t StringOffsetSectionBase = 0;
const DWARFSection *AddrOffsetSection;
uint32_t AddrOffsetSectionBase = 0;
bool isLittleEndian;
@@ -185,6 +207,9 @@ class DWARFUnit {
// Version, address size, and DWARF format.
DWARFFormParams FormParams;
+ /// Start, length, and DWARF format of the unit's contribution to the string
+ /// offsets table (DWARF v5).
+ Optional<StrOffsetsContributionDescriptor> StringOffsetsTableContribution;
uint32_t Offset;
uint32_t Length;
@@ -195,10 +220,40 @@ class DWARFUnit {
/// The compile unit debug information entry items.
std::vector<DWARFDebugInfoEntry> DieArray;
- /// Map from range's start address to end address and corresponding DIE.
- /// IntervalMap does not support range removal, as a result, we use the
- /// std::map::upper_bound for address range lookup.
- std::map<uint64_t, std::pair<uint64_t, DWARFDie>> AddrDieMap;
+ /// The vector of inlined subroutine DIEs that we can map directly to from
+ /// their subprogram below.
+ std::vector<DWARFDie> InlinedSubroutineDIEs;
+
+ /// A type representing a subprogram DIE and a map (built using a sorted
+ /// vector) into that subprogram's inlined subroutine DIEs.
+ struct SubprogramDIEAddrInfo {
+ DWARFDie SubprogramDIE;
+
+ uint64_t SubprogramBasePC;
+
+ /// A vector sorted to allow mapping from a relative PC to the inlined
+ /// subroutine DIE with the most specific address range covering that PC.
+ ///
+ /// The PCs are relative to the `SubprogramBasePC`.
+ ///
+ /// The vector is sorted in ascending order of the first int which
+ /// represents the relative PC for an interval in the map. The second int
+ /// represents the index into the `InlinedSubroutineDIEs` vector of the DIE
+ /// that interval maps to. An index of `-1` indicates an empty mapping. The
+ /// interval covered is from the `.first` relative PC to the next entry's
+ /// `.first` relative PC.
+ std::vector<std::pair<uint32_t, int32_t>> InlinedSubroutineDIEAddrMap;
+ };
+
+ /// Vector of the subprogram DIEs and their subroutine address maps.
+ std::vector<SubprogramDIEAddrInfo> SubprogramDIEAddrInfos;
+
+ /// A vector sorted to allow mapping from a PC to the subprogram DIE (and
+ /// associated addr map) index. Subprograms with overlapping PC ranges aren't
+ /// supported here. Nothing will crash, but the mapping may be inaccurate.
+ /// This vector may also contain "empty" ranges marked by an address with
+ /// a DIE index of `-1`.
+ std::vector<std::pair<uint64_t, int64_t>> SubprogramDIEAddrMap;
using die_iterator_range =
iterator_range<std::vector<DWARFDebugInfoEntry>::iterator>;
@@ -219,6 +274,21 @@ protected:
/// Size in bytes of the unit header.
virtual uint32_t getHeaderSize() const { return getVersion() <= 4 ? 11 : 12; }
+ /// Find the unit's contribution to the string offsets table and determine its
+ /// length and form. The given offset is expected to be derived from the unit
+ /// DIE's DW_AT_str_offsets_base attribute.
+ Optional<StrOffsetsContributionDescriptor>
+ determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
+ uint64_t Offset);
+
+ /// Find the unit's contribution to the string offsets table and determine its
+ /// length and form. The given offset is expected to be 0 in a dwo file or,
+ /// in a dwp file, the start of the unit's contribution to the string offsets
+ /// table section (as determined by the index table).
+ Optional<StrOffsetsContributionDescriptor>
+ determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
+ uint64_t Offset);
+
public:
DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
@@ -242,9 +312,6 @@ public:
AddrOffsetSectionBase = Base;
}
- /// Recursively update address to Die map.
- void updateAddressDieMap(DWARFDie Die);
-
void setRangesSection(const DWARFSection *RS, uint32_t Base) {
RangeSection = RS;
RangeSectionBase = Base;
@@ -272,6 +339,10 @@ public:
uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
uint32_t getLength() const { return Length; }
+ const Optional<StrOffsetsContributionDescriptor> &
+ getStringOffsetsTableContribution() const {
+ return StringOffsetsTableContribution;
+ }
const DWARFFormParams &getFormParams() const { return FormParams; }
uint16_t getVersion() const { return FormParams.Version; }
dwarf::DwarfFormat getFormat() const { return FormParams.Format; }
@@ -281,6 +352,16 @@ public:
return FormParams.getDwarfOffsetByteSize();
}
+ uint8_t getDwarfStringOffsetsByteSize() const {
+ assert(StringOffsetsTableContribution);
+ return StringOffsetsTableContribution->getDwarfOffsetByteSize();
+ }
+
+ uint64_t getStringOffsetsBase() const {
+ assert(StringOffsetsTableContribution);
+ return StringOffsetsTableContribution->Base;
+ }
+
const DWARFAbbreviationDeclarationSet *getAbbreviations() const;
uint8_t getUnitType() const { return UnitType; }
@@ -426,6 +507,9 @@ private:
/// parseDWO - Parses .dwo file for current compile unit. Returns true if
/// it was actually constructed.
bool parseDWO();
+
+ void buildSubprogramDIEAddrMap();
+ void buildInlinedSubroutineDIEAddrMap(SubprogramDIEAddrInfo &SPInfo);
};
} // end namespace llvm
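As a rough illustration of the new accessors (not part of the patch), a reader could fetch one entry of a unit's string offsets contribution along the following lines; the helper name is made up, and the DWARFDataExtractor::getRelocatedValue call is assumed to be available with this signature:

    // Illustrative sketch only: read the Index-th entry of the unit's
    // contribution to the string offsets table using the new accessors.
    static uint64_t readStringOffset(const DWARFUnit &U, DWARFDataExtractor &DA,
                                     unsigned Index) {
      assert(U.getStringOffsetsTableContribution() && "contribution not parsed");
      uint32_t Off = static_cast<uint32_t>(U.getStringOffsetsBase()) +
                     Index * U.getDwarfStringOffsetsByteSize();
      // Assumed API: DWARFDataExtractor::getRelocatedValue(Size, &Offset).
      return DA.getRelocatedValue(U.getDwarfStringOffsetsByteSize(), &Off);
    }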
diff --git a/include/llvm/FuzzMutate/IRMutator.h b/include/llvm/FuzzMutate/IRMutator.h
index 65ab871db0ef..9aa9d6d6a4bc 100644
--- a/include/llvm/FuzzMutate/IRMutator.h
+++ b/include/llvm/FuzzMutate/IRMutator.h
@@ -16,6 +16,7 @@
#ifndef LLVM_FUZZMUTATE_IRMUTATOR_H
#define LLVM_FUZZMUTATE_IRMUTATOR_H
+#include "llvm/ADT/Optional.h"
#include "llvm/FuzzMutate/OpDescriptor.h"
#include "llvm/Support/ErrorHandling.h"
@@ -74,7 +75,8 @@ public:
class InjectorIRStrategy : public IRMutationStrategy {
std::vector<fuzzerop::OpDescriptor> Operations;
- fuzzerop::OpDescriptor chooseOperation(Value *Src, RandomIRBuilder &IB);
+ Optional<fuzzerop::OpDescriptor> chooseOperation(Value *Src,
+ RandomIRBuilder &IB);
public:
InjectorIRStrategy(std::vector<fuzzerop::OpDescriptor> &&Operations)
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index e811ae5e215a..79c56abe1c37 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -248,6 +248,12 @@ public:
/// pgo data.
Optional<uint64_t> getEntryCount() const;
+ /// Return true if the function is annotated with profile data.
+ ///
+ /// Presence of entry counts from a profile run implies the function has
+ /// profile annotations.
+ bool hasProfileData() const { return getEntryCount().hasValue(); }
+
/// Returns the set of GUIDs that needs to be imported to the function for
/// sample PGO, to enable the same inlines as the profiled optimized binary.
DenseSet<GlobalValue::GUID> getImportGUIDs() const;
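A minimal sketch of the intended usage; the consumer function below is hypothetical, and the predicate simply replaces explicit entry-count checks such as the one updated in ModuleSummaryAnalysis.cpp further down:

    // Sketch: prefer the new predicate over spelling out the entry-count check.
    if (F.hasProfileData())            // previously: F.getEntryCount().hasValue()
      applyProfileGuidedHeuristics(F); // hypothetical consumer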
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 07de0568cab0..a2a1f26292ce 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
include "llvm/CodeGen/ValueTypes.td"
+include "llvm/CodeGen/SDNodeProperties.td"
//===----------------------------------------------------------------------===//
// Properties we keep track of for intrinsics.
@@ -264,16 +265,17 @@ def llvm_vararg_ty : LLVMType<isVoid>; // this means vararg here
// intrinsic.
// * Properties can be set to describe the behavior of the intrinsic.
//
-class SDPatternOperator;
class Intrinsic<list<LLVMType> ret_types,
list<LLVMType> param_types = [],
- list<IntrinsicProperty> properties = [],
- string name = ""> : SDPatternOperator {
+ list<IntrinsicProperty> intr_properties = [],
+ string name = "",
+ list<SDNodeProperty> sd_properties = []> : SDPatternOperator {
string LLVMName = name;
string TargetPrefix = ""; // Set to a prefix for target-specific intrinsics.
list<LLVMType> RetTypes = ret_types;
list<LLVMType> ParamTypes = param_types;
- list<IntrinsicProperty> IntrProperties = properties;
+ list<IntrinsicProperty> IntrProperties = intr_properties;
+ let Properties = sd_properties;
bit isTarget = 0;
}
diff --git a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index 14f0c48266f0..d794535700e5 100644
--- a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -148,10 +148,14 @@ public:
/// incremental build.
void setCacheDir(std::string Path) { CacheOptions.Path = std::move(Path); }
- /// Cache policy: interval (seconds) between two prune of the cache. Set to a
- /// negative value (default) to disable pruning. A value of 0 will be ignored.
+ /// Cache policy: interval (seconds) between two prunes of the cache. Set to a
+ /// negative value to disable pruning. A value of 0 will be ignored.
void setCachePruningInterval(int Interval) {
- if (Interval)
+ if (Interval == 0)
+ return;
+ if (Interval < 0)
+ CacheOptions.Policy.Interval.reset();
+ else
CacheOptions.Policy.Interval = std::chrono::seconds(Interval);
}
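A short sketch of how the reworked setter behaves under the new policy (the values are chosen only for illustration):

    ThinLTOCodeGenerator CodeGen;
    CodeGen.setCachePruningInterval(1200); // prune at most every 20 minutes
    CodeGen.setCachePruningInterval(-1);   // negative: Interval reset, pruning disabled
    CodeGen.setCachePruningInterval(0);    // zero: ignored, previous policy kept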
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 234762f36dd4..c538c46fc072 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -165,7 +165,8 @@ protected:
const char *ZeroDirective;
/// This directive allows emission of an ascii string with the standard C
- /// escape characters embedded into it. Defaults to "\t.ascii\t"
+ /// escape characters embedded into it. If a target doesn't support this, it
+ /// can be set to null. Defaults to "\t.ascii\t"
const char *AsciiDirective;
/// If not null, this allows for special handling of zero terminated strings
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 481d96724d40..a82051700708 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -95,6 +95,17 @@ public:
virtual void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS,
const MCInst &Inst, const MCSubtargetInfo &STI);
+ virtual void emitDwarfFileDirective(StringRef Directive);
+
+ /// Update streamer for a new active section.
+ ///
+ /// This is called by PopSection and SwitchSection, if the current
+ /// section changes.
+ virtual void changeSection(const MCSection *CurSection, MCSection *Section,
+ const MCExpr *SubSection, raw_ostream &OS);
+
+ virtual void emitValue(const MCExpr *Value);
+
virtual void finish();
};
diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h
index 5bb1a3fca3d1..71951d83f3cc 100644
--- a/include/llvm/Object/Wasm.h
+++ b/include/llvm/Object/Wasm.h
@@ -43,9 +43,9 @@ public:
};
WasmSymbol(StringRef Name, SymbolType Type, uint32_t Section,
- uint32_t ElementIndex, uint32_t ImportIndex = 0)
+ uint32_t ElementIndex, uint32_t FunctionType = 0)
: Name(Name), Type(Type), Section(Section), ElementIndex(ElementIndex),
- ImportIndex(ImportIndex) {}
+ FunctionType(FunctionType) {}
StringRef Name;
SymbolType Type;
@@ -55,8 +55,18 @@ public:
// Index into either the function or global index space.
uint32_t ElementIndex;
- // For imports, the index into the import table
- uint32_t ImportIndex;
+ // For functions, the type index
+ uint32_t FunctionType;
+
+ // Symbols can be both exported and imported (in the case of a weakly
+ // defined symbol). In that case the import index is stored as AltIndex.
+ uint32_t AltIndex = 0;
+ bool HasAltIndex = false;
+
+ void setAltIndex(uint32_t Index) {
+ HasAltIndex = true;
+ AltIndex = Index;
+ }
bool isFunction() const {
return Type == WasmSymbol::SymbolType::FUNCTION_IMPORT ||
@@ -91,8 +101,7 @@ public:
void print(raw_ostream &Out) const {
Out << "Name=" << Name << ", Type=" << static_cast<int>(Type)
- << ", Flags=" << Flags << " ElemIndex=" << ElementIndex
- << ", ImportIndex=" << ImportIndex;
+ << ", Flags=" << Flags << " ElemIndex=" << ElementIndex;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/include/llvm/Support/CachePruning.h b/include/llvm/Support/CachePruning.h
index c577e9b8b631..327c7df4570f 100644
--- a/include/llvm/Support/CachePruning.h
+++ b/include/llvm/Support/CachePruning.h
@@ -27,8 +27,9 @@ template <typename T> class Expected;
struct CachePruningPolicy {
/// The pruning interval. This is intended to be used to avoid scanning the
/// directory too often. It does not impact the decision of which file to
- /// prune. A value of 0 forces the scan to occur.
- std::chrono::seconds Interval = std::chrono::seconds(1200);
+ /// prune. A value of 0 forces the scan to occur. A value of None disables
+ /// pruning.
+ llvm::Optional<std::chrono::seconds> Interval = std::chrono::seconds(1200);
/// The expiration for a file. When a file hasn't been accessed for Expiration
/// seconds, it is removed from the cache. A value of 0 disables the
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 59c93f15d7b8..7b849fdb8670 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -15,6 +15,7 @@
#define LLVM_SUPPORT_MEMORYBUFFER_H
#include "llvm-c/Types.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/CBindingWrapping.h"
@@ -47,6 +48,9 @@ protected:
void init(const char *BufStart, const char *BufEnd,
bool RequiresNullTerminator);
+
+ static constexpr bool Writable = false;
+
public:
MemoryBuffer(const MemoryBuffer &) = delete;
MemoryBuffer &operator=(const MemoryBuffer &) = delete;
@@ -119,12 +123,6 @@ public:
static std::unique_ptr<MemoryBuffer>
getNewMemBuffer(size_t Size, StringRef BufferName = "");
- /// Allocate a new MemoryBuffer of the specified size that is not initialized.
- /// Note that the caller should initialize the memory allocated by this
- /// method. The memory is owned by the MemoryBuffer object.
- static std::unique_ptr<MemoryBuffer>
- getNewUninitMemBuffer(size_t Size, const Twine &BufferName = "");
-
/// Read all of stdin into a file buffer, and return it.
static ErrorOr<std::unique_ptr<MemoryBuffer>> getSTDIN();
@@ -156,6 +154,62 @@ public:
MemoryBufferRef getMemBufferRef() const;
};
+/// This class is an extension of MemoryBuffer, which allows writing to the
+/// underlying contents. It only supports creation methods that are guaranteed
+/// to produce a writable buffer. For example, mapping a file read-only is not
+/// supported.
+class WritableMemoryBuffer : public MemoryBuffer {
+protected:
+ WritableMemoryBuffer() = default;
+
+ static constexpr bool Writable = true;
+
+public:
+ using MemoryBuffer::getBuffer;
+ using MemoryBuffer::getBufferEnd;
+ using MemoryBuffer::getBufferStart;
+
+ // const_cast is well-defined here, because the underlying buffer is
+ // guaranteed to have been initialized with a mutable buffer.
+ char *getBufferStart() {
+ return const_cast<char *>(MemoryBuffer::getBufferStart());
+ }
+ char *getBufferEnd() {
+ return const_cast<char *>(MemoryBuffer::getBufferEnd());
+ }
+ MutableArrayRef<char> getBuffer() {
+ return {getBufferStart(), getBufferEnd()};
+ }
+
+ static ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
+ getFile(const Twine &Filename, int64_t FileSize = -1,
+ bool IsVolatile = false);
+
+ /// Map a subrange of the specified file as a WritableMemoryBuffer.
+ static ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
+ getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset,
+ bool IsVolatile = false);
+
+ /// Allocate a new MemoryBuffer of the specified size that is not initialized.
+ /// Note that the caller should initialize the memory allocated by this
+ /// method. The memory is owned by the MemoryBuffer object.
+ static std::unique_ptr<WritableMemoryBuffer>
+ getNewUninitMemBuffer(size_t Size, const Twine &BufferName = "");
+
+private:
+ // Hide these base class factory functions so one can't write
+ // WritableMemoryBuffer::getXXX()
+ // and be surprised to get a read-only buffer.
+ using MemoryBuffer::getFileAsStream;
+ using MemoryBuffer::getFileOrSTDIN;
+ using MemoryBuffer::getMemBuffer;
+ using MemoryBuffer::getMemBufferCopy;
+ using MemoryBuffer::getNewMemBuffer;
+ using MemoryBuffer::getOpenFile;
+ using MemoryBuffer::getOpenFileSlice;
+ using MemoryBuffer::getSTDIN;
+};
+
class MemoryBufferRef {
StringRef Buffer;
StringRef Identifier;
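A minimal sketch, assuming Size and Src (say, a StringRef) are in scope and the usual headers are included, of how the new writable buffer is meant to be used; the read-only factory functions are hidden on purpose:

    // Sketch: allocate an uninitialized buffer and fill it through the
    // mutable accessors; the caller is responsible for initialization.
    std::unique_ptr<WritableMemoryBuffer> Buf =
        WritableMemoryBuffer::getNewUninitMemBuffer(Size, "copy");
    std::memcpy(Buf->getBufferStart(), Src.data(), Size);
    MutableArrayRef<char> Bytes = Buf->getBuffer(); // mutable view of the contents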
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 83b097a199d6..674c78a11695 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -549,9 +549,9 @@ inline QuotingType needsQuotes(StringRef S) {
// range.
if (C <= 0x1F)
return QuotingType::Double;
- // C1 control block (0x80 - 0x9F) is excluded from the allowed character
- // range.
- if (C >= 0x80 && C <= 0x9F)
+
+ // Always double quote UTF-8.
+ if ((C & 0x80) != 0)
return QuotingType::Double;
// The character is not safe, at least simple quoting needed.
@@ -1725,7 +1725,7 @@ template <typename T> struct StdMapStringCustomMappingTraitsImpl {
template <> struct ScalarTraits<Type> { \
static void output(const Type &Value, void *ctx, raw_ostream &Out); \
static StringRef input(StringRef Scalar, void *ctxt, Type &Value); \
- static QuotingType mustQuote(StringRef) { return MustQuote; } \
+ static QuotingType mustQuote(StringRef) { return MustQuote; } \
}; \
} \
}
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index 5421b22462ae..97442f9a7849 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -24,6 +24,7 @@
namespace llvm {
+class Function;
class GlobalValue;
class MachineModuleInfo;
class Mangler;
@@ -38,6 +39,7 @@ class PassManagerBuilder;
class Target;
class TargetIntrinsicInfo;
class TargetIRAnalysis;
+class TargetTransformInfo;
class TargetLoweringObjectFile;
class TargetPassConfig;
class TargetSubtargetInfo;
@@ -204,7 +206,13 @@ public:
/// This is used to construct the new pass manager's target IR analysis pass,
/// set up appropriately for this target machine. Even the old pass manager
/// uses this to answer queries about the IR.
- virtual TargetIRAnalysis getTargetIRAnalysis();
+ TargetIRAnalysis getTargetIRAnalysis();
+
+ /// \brief Return a TargetTransformInfo for a given function.
+ ///
+ /// The returned TargetTransformInfo is specialized to the subtarget
+ /// corresponding to \p F.
+ virtual TargetTransformInfo getTargetTransformInfo(const Function &F);
/// Allow the target to modify the pass manager, e.g. by calling
/// PassManagerBuilder::addExtension.
@@ -280,11 +288,11 @@ protected: // Can only create subclasses.
void initAsmInfo();
public:
- /// \brief Get a TargetIRAnalysis implementation for the target.
+ /// \brief Get a TargetTransformInfo implementation for the target.
///
- /// This analysis will produce a TTI result which uses the common code
- /// generator to answer queries about the IR.
- TargetIRAnalysis getTargetIRAnalysis() override;
+ /// The TTI returned uses the common code generator to answer queries about
+ /// the IR.
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
/// Create a pass configuration object to be used by addPassToEmitX methods
/// for generating a pipeline of CodeGen passes.
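For context, a target override of the new hook typically takes the shape below; XYZTargetMachine and XYZTTIImpl are placeholder names, not part of the patch:

    TargetTransformInfo
    XYZTargetMachine::getTargetTransformInfo(const Function &F) {
      // Build a TTI result specialized to the subtarget selected for F.
      return TargetTransformInfo(XYZTTIImpl(this, F));
    }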
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 06caa21d288c..f6162377b8b7 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -286,32 +286,6 @@ class SDCallSeqEnd<list<SDTypeConstraint> constraints> :
SDTypeProfile<0, 2, constraints>;
//===----------------------------------------------------------------------===//
-// Selection DAG Node Properties.
-//
-// Note: These are hard coded into tblgen.
-//
-class SDNodeProperty;
-def SDNPCommutative : SDNodeProperty; // X op Y == Y op X
-def SDNPAssociative : SDNodeProperty; // (X op Y) op Z == X op (Y op Z)
-def SDNPHasChain : SDNodeProperty; // R/W chain operand and result
-def SDNPOutGlue : SDNodeProperty; // Write a flag result
-def SDNPInGlue : SDNodeProperty; // Read a flag operand
-def SDNPOptInGlue : SDNodeProperty; // Optionally read a flag operand
-def SDNPMayStore : SDNodeProperty; // May write to memory, sets 'mayStore'.
-def SDNPMayLoad : SDNodeProperty; // May read memory, sets 'mayLoad'.
-def SDNPSideEffect : SDNodeProperty; // Sets 'HasUnmodelledSideEffects'.
-def SDNPMemOperand : SDNodeProperty; // Touches memory, has assoc MemOperand
-def SDNPVariadic : SDNodeProperty; // Node has variable arguments.
-def SDNPWantRoot : SDNodeProperty; // ComplexPattern gets the root of match
-def SDNPWantParent : SDNodeProperty; // ComplexPattern gets the parent
-
-//===----------------------------------------------------------------------===//
-// Selection DAG Pattern Operations
-class SDPatternOperator {
- list<SDNodeProperty> Properties = [];
-}
-
-//===----------------------------------------------------------------------===//
// Selection DAG Node definitions.
//
class SDNode<string opcode, SDTypeProfile typeprof,
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index cd6b770f76ac..b1e13f17aef1 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -133,7 +133,7 @@ ModulePass *createAddressSanitizerModulePass(bool CompileKernel = false,
FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0,
bool Recover = false);
-FunctionPass *createHWAddressSanitizerPass();
+FunctionPass *createHWAddressSanitizerPass(bool Recover = false);
// Insert ThreadSanitizer (race detection) instrumentation
FunctionPass *createThreadSanitizerPass();
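A small sketch of requesting recoverable instrumentation through the changed factory; the surrounding pass-manager setup (and the module M) is assumed:

    legacy::FunctionPassManager FPM(&M); // M: some llvm::Module
    FPM.add(createHWAddressSanitizerPass(/*Recover=*/true));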
diff --git a/include/llvm/Transforms/Utils/CallPromotionUtils.h b/include/llvm/Transforms/Utils/CallPromotionUtils.h
index e0bf85781d81..6e8ece723638 100644
--- a/include/llvm/Transforms/Utils/CallPromotionUtils.h
+++ b/include/llvm/Transforms/Utils/CallPromotionUtils.h
@@ -29,13 +29,23 @@ namespace llvm {
bool isLegalToPromote(CallSite CS, Function *Callee,
const char **FailureReason = nullptr);
+/// Promote the given indirect call site to unconditionally call \p Callee.
+///
+/// This function promotes the given call site, returning the direct call or
+/// invoke instruction. If the function type of the call site doesn't match that
+/// of the callee, bitcast instructions are inserted where appropriate. If \p
+/// RetBitCast is non-null, it will be used to store the return value bitcast,
+/// if created.
+Instruction *promoteCall(CallSite CS, Function *Callee,
+ CastInst **RetBitCast = nullptr);
+
/// Promote the given indirect call site to conditionally call \p Callee.
///
/// This function creates an if-then-else structure at the location of the call
-/// site. The original call site is promoted and moved into the "then" block. A
-/// clone of the indirect call site is placed in the "else" block and returned.
-/// If \p BranchWeights is non-null, it will be used to set !prof metadata on
-/// the new conditional branch.
+/// site. The original call site is moved into the "else" block. A clone of the
+/// indirect call site is promoted, placed in the "then" block, and returned. If
+/// \p BranchWeights is non-null, it will be used to set !prof metadata on the
+/// new conditional branch.
Instruction *promoteCallWithIfThenElse(CallSite CS, Function *Callee,
MDNode *BranchWeights = nullptr);
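A hedged sketch of the intended call pattern, assuming CS and Callee are already known (for example, from profile-guided indirect call promotion):

    if (isLegalToPromote(CS, Callee)) {
      // The clone placed in the "then" block is the promoted direct call.
      Instruction *Direct =
          promoteCallWithIfThenElse(CS, Callee, /*BranchWeights=*/nullptr);
      (void)Direct;
    }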
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 382942be64a1..d8b07c4f54da 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -61,7 +61,7 @@ module LLVM_BinaryFormat {
textual header "BinaryFormat/ELFRelocs/SystemZ.def"
textual header "BinaryFormat/ELFRelocs/x86_64.def"
textual header "BinaryFormat/ELFRelocs/WebAssembly.def"
- textual header "BinaryFormat/WasmRelocs/WebAssembly.def"
+ textual header "BinaryFormat/WasmRelocs.def"
}
module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } }
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index dd2db1e5b27b..55df66714178 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -133,9 +133,9 @@ ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) {
}
ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) {
- // We may have two calls
+ // We may have two calls.
if (auto CS = ImmutableCallSite(I)) {
- // Check if the two calls modify the same memory
+ // Check if the two calls modify the same memory.
return getModRefInfo(CS, Call);
} else if (I->isFenceLike()) {
// If this is a fence, just return ModRef.
@@ -179,6 +179,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
if (onlyAccessesArgPointees(MRB) || onlyAccessesInaccessibleOrArgMem(MRB)) {
bool DoesAlias = false;
+ bool IsMustAlias = true;
ModRefInfo AllArgsMask = ModRefInfo::NoModRef;
if (doesAccessArgPointees(MRB)) {
for (auto AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) {
@@ -193,6 +194,8 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
DoesAlias = true;
AllArgsMask = unionModRef(AllArgsMask, ArgMask);
}
+ // Conservatively clear IsMustAlias unless only MustAlias is found.
+ IsMustAlias &= (ArgAlias == MustAlias);
}
}
// Return NoModRef if no alias found with any argument.
@@ -200,6 +203,8 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS,
return ModRefInfo::NoModRef;
// Logical & between other AA analyses and argument analysis.
Result = intersectModRef(Result, AllArgsMask);
+ // If only MustAlias found above, set Must bit.
+ Result = IsMustAlias ? setMust(Result) : clearMust(Result);
}
// If Loc is a constant memory location, the call definitely could not
@@ -251,6 +256,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
if (onlyAccessesArgPointees(CS2B)) {
ModRefInfo R = ModRefInfo::NoModRef;
if (doesAccessArgPointees(CS2B)) {
+ bool IsMustAlias = true;
for (auto I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
const Value *Arg = *I;
if (!Arg->getType()->isPointerTy())
@@ -274,10 +280,19 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
ModRefInfo ModRefCS1 = getModRefInfo(CS1, CS2ArgLoc);
ArgMask = intersectModRef(ArgMask, ModRefCS1);
+ // Conservatively clear IsMustAlias unless only MustAlias is found.
+ IsMustAlias &= isMustSet(ModRefCS1);
+
R = intersectModRef(unionModRef(R, ArgMask), Result);
- if (R == Result)
+ if (R == Result) {
+ // On early exit, not all args were checked, so we cannot set Must.
+ if (I + 1 != E)
+ IsMustAlias = false;
break;
+ }
}
+ // If Alias found and only MustAlias found above, set Must bit.
+ R = IsMustAlias ? setMust(R) : clearMust(R);
}
return R;
}
@@ -287,6 +302,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
if (onlyAccessesArgPointees(CS1B)) {
ModRefInfo R = ModRefInfo::NoModRef;
if (doesAccessArgPointees(CS1B)) {
+ bool IsMustAlias = true;
for (auto I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
const Value *Arg = *I;
if (!Arg->getType()->isPointerTy())
@@ -303,9 +319,18 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1,
(isRefSet(ArgModRefCS1) && isModSet(ModRefCS2)))
R = intersectModRef(unionModRef(R, ArgModRefCS1), Result);
- if (R == Result)
+ // Conservatively clear IsMustAlias unless only MustAlias is found.
+ IsMustAlias &= isMustSet(ModRefCS2);
+
+ if (R == Result) {
+ // On early exit, not all args were checked, so we cannot set Must.
+ if (I + 1 != E)
+ IsMustAlias = false;
break;
+ }
}
+ // If Alias found and only MustAlias found above, set Must bit.
+ R = IsMustAlias ? setMust(R) : clearMust(R);
}
return R;
}
@@ -353,9 +378,13 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
// If the load address doesn't alias the given address, it doesn't read
// or write the specified memory.
- if (Loc.Ptr && !alias(MemoryLocation::get(L), Loc))
- return ModRefInfo::NoModRef;
-
+ if (Loc.Ptr) {
+ AliasResult AR = alias(MemoryLocation::get(L), Loc);
+ if (AR == NoAlias)
+ return ModRefInfo::NoModRef;
+ if (AR == MustAlias)
+ return ModRefInfo::MustRef;
+ }
// Otherwise, a load just reads.
return ModRefInfo::Ref;
}
@@ -367,15 +396,20 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
return ModRefInfo::ModRef;
if (Loc.Ptr) {
+ AliasResult AR = alias(MemoryLocation::get(S), Loc);
// If the store address cannot alias the pointer in question, then the
// specified memory cannot be modified by the store.
- if (!alias(MemoryLocation::get(S), Loc))
+ if (AR == NoAlias)
return ModRefInfo::NoModRef;
// If the pointer is a pointer to constant memory, then it could not have
// been modified by this store.
if (pointsToConstantMemory(Loc))
return ModRefInfo::NoModRef;
+
+ // If the store address aliases the pointer as must alias, set Must.
+ if (AR == MustAlias)
+ return ModRefInfo::MustMod;
}
// Otherwise, a store just writes.
@@ -393,15 +427,20 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *S, const MemoryLocation &Lo
ModRefInfo AAResults::getModRefInfo(const VAArgInst *V,
const MemoryLocation &Loc) {
if (Loc.Ptr) {
+ AliasResult AR = alias(MemoryLocation::get(V), Loc);
// If the va_arg address cannot alias the pointer in question, then the
// specified memory cannot be accessed by the va_arg.
- if (!alias(MemoryLocation::get(V), Loc))
+ if (AR == NoAlias)
return ModRefInfo::NoModRef;
// If the pointer is a pointer to constant memory, then it could not have
// been modified by this va_arg.
if (pointsToConstantMemory(Loc))
return ModRefInfo::NoModRef;
+
+ // If the va_arg aliases the pointer as must alias, set Must.
+ if (AR == MustAlias)
+ return ModRefInfo::MustModRef;
}
// Otherwise, a va_arg reads and writes.
@@ -440,9 +479,17 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX,
if (isStrongerThanMonotonic(CX->getSuccessOrdering()))
return ModRefInfo::ModRef;
- // If the cmpxchg address does not alias the location, it does not access it.
- if (Loc.Ptr && !alias(MemoryLocation::get(CX), Loc))
- return ModRefInfo::NoModRef;
+ if (Loc.Ptr) {
+ AliasResult AR = alias(MemoryLocation::get(CX), Loc);
+ // If the cmpxchg address does not alias the location, it does not access
+ // it.
+ if (AR == NoAlias)
+ return ModRefInfo::NoModRef;
+
+ // If the cmpxchg address aliases the pointer as must alias, set Must.
+ if (AR == MustAlias)
+ return ModRefInfo::MustModRef;
+ }
return ModRefInfo::ModRef;
}
@@ -453,9 +500,17 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW,
if (isStrongerThanMonotonic(RMW->getOrdering()))
return ModRefInfo::ModRef;
- // If the atomicrmw address does not alias the location, it does not access it.
- if (Loc.Ptr && !alias(MemoryLocation::get(RMW), Loc))
- return ModRefInfo::NoModRef;
+ if (Loc.Ptr) {
+ AliasResult AR = alias(MemoryLocation::get(RMW), Loc);
+ // If the atomicrmw address does not alias the location, it does not access
+ // it.
+ if (AR == NoAlias)
+ return ModRefInfo::NoModRef;
+
+ // If the atomicrmw address aliases the pointer as must alias, set Must.
+ if (AR == MustAlias)
+ return ModRefInfo::MustModRef;
+ }
return ModRefInfo::ModRef;
}
@@ -493,6 +548,8 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
unsigned ArgNo = 0;
ModRefInfo R = ModRefInfo::NoModRef;
+ bool IsMustAlias = true;
+ // The flag stays set only if no MayAlias is found and all operands are processed.
for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
CI != CE; ++CI, ++ArgNo) {
// Only look at the no-capture or byval pointer arguments. If this
@@ -503,11 +560,14 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
ArgNo < CS.getNumArgOperands() && !CS.isByValArgument(ArgNo)))
continue;
+ AliasResult AR = alias(MemoryLocation(*CI), MemoryLocation(Object));
// If this is a no-capture pointer argument, see if we can tell that it
// is impossible to alias the pointer we're checking. If not, we have to
// assume that the call could touch the pointer, even though it doesn't
// escape.
- if (isNoAlias(MemoryLocation(*CI), MemoryLocation(Object)))
+ if (AR != MustAlias)
+ IsMustAlias = false;
+ if (AR == NoAlias)
continue;
if (CS.doesNotAccessMemory(ArgNo))
continue;
@@ -515,9 +575,10 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
R = ModRefInfo::Ref;
continue;
}
+ // Not returning MustModRef since we have not seen all the arguments.
return ModRefInfo::ModRef;
}
- return R;
+ return IsMustAlias ? setMust(R) : clearMust(R);
}
/// canBasicBlockModify - Return true if it is possible for execution of the
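A brief sketch of what the new Must bit buys a client, assuming AA, S (a StoreInst *), and Loc are in scope:

    ModRefInfo MRI = AA.getModRefInfo(S, Loc);
    if (isModSet(MRI) && isMustSet(MRI)) {
      // The store is known to write exactly the queried location (MustMod),
      // not merely to possibly overlap it.
    }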
diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp
index 423acf739f58..f737cecc43d1 100644
--- a/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -31,9 +31,13 @@ static cl::opt<bool> PrintPartialAlias("print-partial-aliases", cl::ReallyHidden
static cl::opt<bool> PrintMustAlias("print-must-aliases", cl::ReallyHidden);
static cl::opt<bool> PrintNoModRef("print-no-modref", cl::ReallyHidden);
-static cl::opt<bool> PrintMod("print-mod", cl::ReallyHidden);
static cl::opt<bool> PrintRef("print-ref", cl::ReallyHidden);
+static cl::opt<bool> PrintMod("print-mod", cl::ReallyHidden);
static cl::opt<bool> PrintModRef("print-modref", cl::ReallyHidden);
+static cl::opt<bool> PrintMust("print-must", cl::ReallyHidden);
+static cl::opt<bool> PrintMustRef("print-mustref", cl::ReallyHidden);
+static cl::opt<bool> PrintMustMod("print-mustmod", cl::ReallyHidden);
+static cl::opt<bool> PrintMustModRef("print-mustmodref", cl::ReallyHidden);
static cl::opt<bool> EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden);
@@ -262,6 +266,25 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
F.getParent());
++ModRefCount;
break;
+ case ModRefInfo::Must:
+ PrintModRefResults("Must", PrintMust, I, Pointer, F.getParent());
+ ++MustCount;
+ break;
+ case ModRefInfo::MustMod:
+ PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, I, Pointer,
+ F.getParent());
+ ++MustModCount;
+ break;
+ case ModRefInfo::MustRef:
+ PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, I, Pointer,
+ F.getParent());
+ ++MustRefCount;
+ break;
+ case ModRefInfo::MustModRef:
+ PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, I,
+ Pointer, F.getParent());
+ ++MustModRefCount;
+ break;
}
}
}
@@ -288,6 +311,25 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent());
++ModRefCount;
break;
+ case ModRefInfo::Must:
+ PrintModRefResults("Must", PrintMust, *C, *D, F.getParent());
+ ++MustCount;
+ break;
+ case ModRefInfo::MustMod:
+ PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, *C, *D,
+ F.getParent());
+ ++MustModCount;
+ break;
+ case ModRefInfo::MustRef:
+ PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, *C, *D,
+ F.getParent());
+ ++MustRefCount;
+ break;
+ case ModRefInfo::MustModRef:
+ PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, *C, *D,
+ F.getParent());
+ ++MustModRefCount;
+ break;
}
}
}
@@ -325,7 +367,8 @@ AAEvaluator::~AAEvaluator() {
}
// Display the summary for mod/ref analysis
- int64_t ModRefSum = NoModRefCount + ModCount + RefCount + ModRefCount;
+ int64_t ModRefSum = NoModRefCount + RefCount + ModCount + ModRefCount +
+ MustCount + MustRefCount + MustModCount + MustModRefCount;
if (ModRefSum == 0) {
errs() << " Alias Analysis Mod/Ref Evaluator Summary: no "
"mod/ref!\n";
@@ -339,10 +382,22 @@ AAEvaluator::~AAEvaluator() {
PrintPercent(RefCount, ModRefSum);
errs() << " " << ModRefCount << " mod & ref responses ";
PrintPercent(ModRefCount, ModRefSum);
+ errs() << " " << MustCount << " must responses ";
+ PrintPercent(MustCount, ModRefSum);
+ errs() << " " << MustModCount << " must mod responses ";
+ PrintPercent(MustModCount, ModRefSum);
+ errs() << " " << MustRefCount << " must ref responses ";
+ PrintPercent(MustRefCount, ModRefSum);
+ errs() << " " << MustModRefCount << " must mod & ref responses ";
+ PrintPercent(MustModRefCount, ModRefSum);
errs() << " Alias Analysis Evaluator Mod/Ref Summary: "
<< NoModRefCount * 100 / ModRefSum << "%/"
<< ModCount * 100 / ModRefSum << "%/" << RefCount * 100 / ModRefSum
- << "%/" << ModRefCount * 100 / ModRefSum << "%\n";
+ << "%/" << ModRefCount * 100 / ModRefSum << "%/"
+ << MustCount * 100 / ModRefSum << "%/"
+ << MustRefCount * 100 / ModRefSum << "%/"
+ << MustModCount * 100 / ModRefSum << "%/"
+ << MustModRefCount * 100 / ModRefSum << "%\n";
}
}
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 81b9f842249e..537813b6b752 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -781,6 +781,7 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
// Optimistically assume that call doesn't touch Object and check this
// assumption in the following loop.
ModRefInfo Result = ModRefInfo::NoModRef;
+ bool IsMustAlias = true;
unsigned OperandNo = 0;
for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
@@ -802,7 +803,8 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
// is impossible to alias the pointer we're checking.
AliasResult AR =
getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object));
-
+ if (AR != MustAlias)
+ IsMustAlias = false;
// Operand doesn't alias 'Object', continue looking for other aliases
if (AR == NoAlias)
continue;
@@ -818,13 +820,20 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
continue;
}
// This operand aliases 'Object' and call reads and writes into it.
+ // Setting ModRef will not yield an early return below; IsMustAlias is not
+ // used any further.
Result = ModRefInfo::ModRef;
break;
}
+ // If no operand aliases 'Object', reset the Must bit. It is set below only
+ // if at least one operand aliases and all aliases found are MustAlias.
+ if (isNoModRef(Result))
+ IsMustAlias = false;
+
// Early return if we improved mod ref information
if (!isModAndRefSet(Result))
- return Result;
+ return IsMustAlias ? setMust(Result) : clearMust(Result);
}
// If the CallSite is to malloc or calloc, we can assume that it doesn't
diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp
index a85af6c9c93f..fb261755e5d1 100644
--- a/lib/Analysis/CFGPrinter.cpp
+++ b/lib/Analysis/CFGPrinter.cpp
@@ -82,7 +82,7 @@ PreservedAnalyses CFGOnlyViewerPass::run(Function &F,
return PreservedAnalyses::all();
}
-static void writeCFGToDotFile(Function &F) {
+static void writeCFGToDotFile(Function &F, bool CFGOnly = false) {
std::string Filename = ("cfg." + F.getName() + ".dot").str();
errs() << "Writing '" << Filename << "'...";
@@ -90,7 +90,7 @@ static void writeCFGToDotFile(Function &F) {
raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
if (!EC)
- WriteGraph(File, (const Function*)&F);
+ WriteGraph(File, (const Function*)&F, CFGOnly);
else
errs() << " error opening file for writing!";
errs() << "\n";
@@ -134,7 +134,7 @@ namespace {
}
bool runOnFunction(Function &F) override {
- writeCFGToDotFile(F);
+ writeCFGToDotFile(F, /*CFGOnly=*/true);
return false;
}
void print(raw_ostream &OS, const Module* = nullptr) const override {}
@@ -152,7 +152,7 @@ INITIALIZE_PASS(CFGOnlyPrinterLegacyPass, "dot-cfg-only",
PreservedAnalyses CFGOnlyPrinterPass::run(Function &F,
FunctionAnalysisManager &AM) {
- writeCFGToDotFile(F);
+ writeCFGToDotFile(F, /*CFGOnly=*/true);
return PreservedAnalyses::all();
}
diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp
index 23109c67e5c3..daee93267f56 100644
--- a/lib/Analysis/GlobalsModRef.cpp
+++ b/lib/Analysis/GlobalsModRef.cpp
@@ -85,12 +85,17 @@ class GlobalsAAResult::FunctionInfo {
/// The bit that flags that this function may read any global. This is
/// chosen to mix together with ModRefInfo bits.
/// FIXME: This assumes ModRefInfo lattice will remain 4 bits!
+ /// It overlaps with the ModRefInfo::Must bit!
+ /// FunctionInfo.getModRefInfo() masks out everything except ModRef, so
+ /// this remains correct, but the Must info is lost.
enum { MayReadAnyGlobal = 4 };
/// Checks to document the invariants of the bit packing here.
- static_assert((MayReadAnyGlobal & static_cast<int>(ModRefInfo::ModRef)) == 0,
+ static_assert((MayReadAnyGlobal & static_cast<int>(ModRefInfo::MustModRef)) ==
+ 0,
"ModRef and the MayReadAnyGlobal flag bits overlap.");
- static_assert(((MayReadAnyGlobal | static_cast<int>(ModRefInfo::ModRef)) >>
+ static_assert(((MayReadAnyGlobal |
+ static_cast<int>(ModRefInfo::MustModRef)) >>
AlignedMapPointerTraits::NumLowBitsAvailable) == 0,
"Insufficient low bits to store our flag and ModRef info.");
@@ -125,14 +130,22 @@ public:
return *this;
}
+ /// This method clears the MayReadAnyGlobal bit added by GlobalsAAResult to return
+ /// the corresponding ModRefInfo. It must align in functionality with
+ /// clearMust().
+ ModRefInfo globalClearMayReadAnyGlobal(int I) const {
+ return ModRefInfo((I & static_cast<int>(ModRefInfo::ModRef)) |
+ static_cast<int>(ModRefInfo::NoModRef));
+ }
+
/// Returns the \c ModRefInfo info for this function.
ModRefInfo getModRefInfo() const {
- return ModRefInfo(Info.getInt() & static_cast<int>(ModRefInfo::ModRef));
+ return globalClearMayReadAnyGlobal(Info.getInt());
}
/// Adds new \c ModRefInfo for this function to its state.
void addModRefInfo(ModRefInfo NewMRI) {
- Info.setInt(Info.getInt() | static_cast<int>(NewMRI));
+ Info.setInt(Info.getInt() | static_cast<int>(setMust(NewMRI)));
}
/// Returns whether this function may read any global variable, and we don't
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index fba96c8976a6..b0cb29203a5a 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -249,8 +249,6 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool visitCastInst(CastInst &I);
bool visitUnaryInstruction(UnaryInstruction &I);
bool visitCmpInst(CmpInst &I);
- bool visitAnd(BinaryOperator &I);
- bool visitOr(BinaryOperator &I);
bool visitSub(BinaryOperator &I);
bool visitBinaryOperator(BinaryOperator &I);
bool visitLoad(LoadInst &I);
@@ -363,6 +361,7 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
void CallAnalyzer::disableLoadElimination() {
if (EnableLoadElimination) {
Cost += LoadEliminationCost;
+ LoadEliminationCost = 0;
EnableLoadElimination = false;
}
}
@@ -700,6 +699,22 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
disableSROA(I.getOperand(0));
+ // If this is a floating-point cast, and the target says this operation
+ // is expensive, this may eventually become a library call. Treat the cost
+ // as such.
+ switch (I.getOpcode()) {
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
+ Cost += InlineConstants::CallPenalty;
+ default:
+ break;
+ }
+
return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
@@ -1004,34 +1019,6 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
return false;
}
-bool CallAnalyzer::visitOr(BinaryOperator &I) {
- // This is necessary because the generic simplify instruction only works if
- // both operands are constants.
- for (unsigned i = 0; i < 2; ++i) {
- if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
- SimplifiedValues.lookup(I.getOperand(i))))
- if (C->isAllOnesValue()) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
- return Base::visitOr(I);
-}
-
-bool CallAnalyzer::visitAnd(BinaryOperator &I) {
- // This is necessary because the generic simplify instruction only works if
- // both operands are constants.
- for (unsigned i = 0; i < 2; ++i) {
- if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
- SimplifiedValues.lookup(I.getOperand(i))))
- if (C->isZero()) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
- return Base::visitAnd(I);
-}
-
bool CallAnalyzer::visitSub(BinaryOperator &I) {
// Try to handle a special case: we can fold computing the difference of two
// constant-related pointers.
@@ -1061,23 +1048,38 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
- Value *SimpleV = nullptr;
- if (auto FI = dyn_cast<FPMathOperator>(&I))
- SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
- FI->getFastMathFlags(), DL);
- else
- SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
- return dyn_cast_or_null<Constant>(SimpleV);
- };
+ Constant *CLHS = dyn_cast<Constant>(LHS);
+ if (!CLHS)
+ CLHS = SimplifiedValues.lookup(LHS);
+ Constant *CRHS = dyn_cast<Constant>(RHS);
+ if (!CRHS)
+ CRHS = SimplifiedValues.lookup(RHS);
+
+ Value *SimpleV = nullptr;
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
+ CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
+ else
+ SimpleV =
+ SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
- if (simplifyInstruction(I, Evaluate))
+ if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
+ SimplifiedValues[&I] = C;
+
+ if (SimpleV)
return true;
// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
disableSROA(LHS);
disableSROA(RHS);
+ // If the instruction is floating point, and the target says this operation
+ // is expensive, this may eventually become a library call. Treat the cost
+ // as such.
+ if (I.getType()->isFloatingPointTy() &&
+ TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
+ Cost += InlineConstants::CallPenalty;
+
return false;
}
@@ -1097,7 +1099,7 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
// by any stores or calls, this load is likely to be redundant and can be
// eliminated.
if (EnableLoadElimination &&
- !LoadAddrSet.insert(I.getPointerOperand()).second) {
+ !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
LoadEliminationCost += InlineConstants::InstrCost;
return true;
}
@@ -1547,17 +1549,6 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
++NumVectorInstructions;
- // If the instruction is floating point, and the target says this operation
- // is expensive or the function has the "use-soft-float" attribute, this may
- // eventually become a library call. Treat the cost as such.
- if (I->getType()->isFloatingPointTy()) {
- // If the function has the "use-soft-float" attribute, mark it as
- // expensive.
- if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
- (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
- Cost += InlineConstants::CallPenalty;
- }
-
// If the instruction simplified to a constant, there is no cost to this
// instruction. Visit the instructions using our InstVisitor to account for
// all of the per-instruction logic. The visit tree returns true if we
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index ed8e5e8cc489..e141d6c58b65 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1107,77 +1107,6 @@ static unsigned getAddressSpaceOperand(Value *I) {
return -1;
}
-// TODO:This API can be improved by using the permutation of given width as the
-// accesses are entered into the map.
-bool llvm::sortLoadAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
- ScalarEvolution &SE,
- SmallVectorImpl<Value *> &Sorted,
- SmallVectorImpl<unsigned> *Mask) {
- SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
- OffValPairs.reserve(VL.size());
- Sorted.reserve(VL.size());
-
- // Walk over the pointers, and map each of them to an offset relative to
- // first pointer in the array.
- Value *Ptr0 = getPointerOperand(VL[0]);
- const SCEV *Scev0 = SE.getSCEV(Ptr0);
- Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
- PointerType *PtrTy = dyn_cast<PointerType>(Ptr0->getType());
- uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
-
- for (auto *Val : VL) {
- // The only kind of access we care about here is load.
- if (!isa<LoadInst>(Val))
- return false;
-
- Value *Ptr = getPointerOperand(Val);
- assert(Ptr && "Expected value to have a pointer operand.");
- // If a pointer refers to a different underlying object, bail - the
- // pointers are by definition incomparable.
- Value *CurrObj = GetUnderlyingObject(Ptr, DL);
- if (CurrObj != Obj0)
- return false;
-
- const SCEVConstant *Diff =
- dyn_cast<SCEVConstant>(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0));
- // The pointers may not have a constant offset from each other, or SCEV
- // may just not be smart enough to figure out they do. Regardless,
- // there's nothing we can do.
- if (!Diff || static_cast<unsigned>(Diff->getAPInt().abs().getSExtValue()) >
- (VL.size() - 1) * Size)
- return false;
-
- OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val);
- }
- SmallVector<unsigned, 4> UseOrder(VL.size());
- for (unsigned i = 0; i < VL.size(); i++) {
- UseOrder[i] = i;
- }
-
- // Sort the memory accesses and keep the order of their uses in UseOrder.
- std::sort(UseOrder.begin(), UseOrder.end(),
- [&OffValPairs](unsigned Left, unsigned Right) {
- return OffValPairs[Left].first < OffValPairs[Right].first;
- });
-
- for (unsigned i = 0; i < VL.size(); i++)
- Sorted.emplace_back(OffValPairs[UseOrder[i]].second);
-
- // Sort UseOrder to compute the Mask.
- if (Mask) {
- Mask->reserve(VL.size());
- for (unsigned i = 0; i < VL.size(); i++)
- Mask->emplace_back(i);
- std::sort(Mask->begin(), Mask->end(),
- [&UseOrder](unsigned Left, unsigned Right) {
- return UseOrder[Left] < UseOrder[Right];
- });
- }
-
- return true;
-}
-
-
/// Returns true if the memory operations \p A and \p B are consecutive.
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
ScalarEvolution &SE, bool CheckType) {
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index a6c590126c2f..bb7bf967994c 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -647,6 +647,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
// Ok, this store might clobber the query pointer. Check to see if it is
// a must alias: in this case, we want to return this as a def.
+ // FIXME: Use ModRefInfo::Must bit from getModRefInfo call above.
MemoryLocation StoreLoc = MemoryLocation::get(SI);
// If we found a pointer, check if it could be the same as our pointer.
@@ -690,7 +691,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
// If necessary, perform additional analysis.
if (isModAndRefSet(MR))
MR = AA.callCapturesBefore(Inst, MemLoc, &DT, &OBB);
- switch (MR) {
+ switch (clearMust(MR)) {
case ModRefInfo::NoModRef:
// If the call has no effect on the queried pointer, just ignore it.
continue;
@@ -919,6 +920,14 @@ void MemoryDependenceResults::getNonLocalPointerDependency(
Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result) {
const MemoryLocation Loc = MemoryLocation::get(QueryInst);
bool isLoad = isa<LoadInst>(QueryInst);
+ return getNonLocalPointerDependencyFrom(QueryInst, Loc, isLoad, Result);
+}
+
+void MemoryDependenceResults::getNonLocalPointerDependencyFrom(
+ Instruction *QueryInst,
+ const MemoryLocation &Loc,
+ bool isLoad,
+ SmallVectorImpl<NonLocalDepResult> &Result) {
BasicBlock *FromBB = QueryInst->getParent();
assert(FromBB);
@@ -1118,21 +1127,15 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// If we already have a cache entry for this CacheKey, we may need to do some
// work to reconcile the cache entry and the current query.
if (!Pair.second) {
- if (CacheInfo->Size < Loc.Size) {
- // The query's Size is greater than the cached one. Throw out the
- // cached data and proceed with the query at the greater size.
+ if (CacheInfo->Size != Loc.Size) {
+ // The query's Size differs from the cached one. Throw out the
+ // cached data and proceed with the query at the new size.
CacheInfo->Pair = BBSkipFirstBlockPair();
CacheInfo->Size = Loc.Size;
for (auto &Entry : CacheInfo->NonLocalDeps)
if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
- } else if (CacheInfo->Size > Loc.Size) {
- // This query's Size is less than the cached one. Conservatively restart
- // the query using the greater size.
- return getNonLocalPointerDepFromBB(
- QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad,
- StartBB, Result, Visited, SkipFirstBlock);
}
// If the query's AATags are inconsistent with the cached one,
diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index 8fe190e8bcf8..6e9368c49d65 100644
--- a/lib/Analysis/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -192,8 +192,6 @@ template <> struct DenseMapInfo<MemoryLocOrCall> {
}
};
-enum class Reorderability { Always, IfNoAlias, Never };
-
} // end namespace llvm
/// This does one-way checks to see if Use could theoretically be hoisted above
@@ -202,22 +200,16 @@ enum class Reorderability { Always, IfNoAlias, Never };
/// This assumes that, for the purposes of MemorySSA, Use comes directly after
/// MayClobber, with no potentially clobbering operations in between them.
/// (Where potentially clobbering ops are memory barriers, aliased stores, etc.)
-static Reorderability getLoadReorderability(const LoadInst *Use,
- const LoadInst *MayClobber) {
+static bool areLoadsReorderable(const LoadInst *Use,
+ const LoadInst *MayClobber) {
bool VolatileUse = Use->isVolatile();
bool VolatileClobber = MayClobber->isVolatile();
// Volatile operations may never be reordered with other volatile operations.
if (VolatileUse && VolatileClobber)
- return Reorderability::Never;
-
- // The lang ref allows reordering of volatile and non-volatile operations.
- // Whether an aliasing nonvolatile load and volatile load can be reordered,
- // though, is ambiguous. Because it may not be best to exploit this ambiguity,
- // we only allow volatile/non-volatile reordering if the volatile and
- // non-volatile operations don't alias.
- Reorderability Result = VolatileUse || VolatileClobber
- ? Reorderability::IfNoAlias
- : Reorderability::Always;
+ return false;
+ // Otherwise, volatile doesn't matter here. From the language reference:
+ // 'optimizers may change the order of volatile operations relative to
+ // non-volatile operations.'
// If a load is seq_cst, it cannot be moved above other loads. If its ordering
// is weaker, it can be moved above other loads. We just need to be sure that
@@ -229,9 +221,7 @@ static Reorderability getLoadReorderability(const LoadInst *Use,
bool SeqCstUse = Use->getOrdering() == AtomicOrdering::SequentiallyConsistent;
bool MayClobberIsAcquire = isAtLeastOrStrongerThan(MayClobber->getOrdering(),
AtomicOrdering::Acquire);
- if (SeqCstUse || MayClobberIsAcquire)
- return Reorderability::Never;
- return Result;
+ return !(SeqCstUse || MayClobberIsAcquire);
}
static bool instructionClobbersQuery(MemoryDef *MD,
@@ -265,18 +255,9 @@ static bool instructionClobbersQuery(MemoryDef *MD,
return isModOrRefSet(I);
}
- if (auto *DefLoad = dyn_cast<LoadInst>(DefInst)) {
- if (auto *UseLoad = dyn_cast<LoadInst>(UseInst)) {
- switch (getLoadReorderability(UseLoad, DefLoad)) {
- case Reorderability::Always:
- return false;
- case Reorderability::Never:
- return true;
- case Reorderability::IfNoAlias:
- return !AA.isNoAlias(UseLoc, MemoryLocation::get(DefLoad));
- }
- }
- }
+ if (auto *DefLoad = dyn_cast<LoadInst>(DefInst))
+ if (auto *UseLoad = dyn_cast<LoadInst>(UseInst))
+ return !areLoadsReorderable(UseLoad, DefLoad);
return isModSet(AA.getModRefInfo(DefInst, UseLoc));
}
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index d54fb700200d..10badd89a4a8 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -454,7 +454,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
std::unique_ptr<BlockFrequencyInfo> BFIPtr;
if (GetBFICallback)
BFI = GetBFICallback(F);
- else if (F.getEntryCount().hasValue()) {
+ else if (F.hasProfileData()) {
LoopInfo LI{DominatorTree(const_cast<Function &>(F))};
BranchProbabilityInfo BPI{F, LI};
BFIPtr = llvm::make_unique<BlockFrequencyInfo>(F, BPI, LI);
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 671744f93fb8..347d093b0f61 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -115,42 +115,62 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
return FunctionCount && isHotCount(FunctionCount.getValue());
}
-/// Returns true if the function's entry or total call edge count is hot.
+/// Returns true if the function contains hot code. This can include a hot
+/// function entry count, a hot basic block, or (in the case of Sample PGO) a
+/// hot total call edge count.
/// If it returns false, it either means it is not hot or it is unknown
-/// whether it is hot or not (for example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F) {
+/// (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
+ BlockFrequencyInfo &BFI) {
if (!F || !computeSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
if (isHotCount(FunctionCount.getValue()))
return true;
- uint64_t TotalCallCount = 0;
+ if (hasSampleProfile()) {
+ uint64_t TotalCallCount = 0;
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (auto CallCount = getProfileCount(&I, nullptr))
+ TotalCallCount += CallCount.getValue();
+ if (isHotCount(TotalCallCount))
+ return true;
+ }
for (const auto &BB : *F)
- for (const auto &I : BB)
- if (isa<CallInst>(I) || isa<InvokeInst>(I))
- if (auto CallCount = getProfileCount(&I, nullptr))
- TotalCallCount += CallCount.getValue();
- return isHotCount(TotalCallCount);
+ if (isHotBB(&BB, &BFI))
+ return true;
+ return false;
}
-/// Returns true if the function's entry and total call edge count is cold.
+/// Returns true if the function only contains cold code. This means that
+/// the function entry and blocks are all cold, and (in the case of Sample PGO)
+/// the total call edge count is cold.
/// If it returns false, it either means it is not cold or it is unknown
-/// whether it is cold or not (for example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F) {
+/// (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
+ BlockFrequencyInfo &BFI) {
if (!F || !computeSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
if (!isColdCount(FunctionCount.getValue()))
return false;
-
- uint64_t TotalCallCount = 0;
+
+ if (hasSampleProfile()) {
+ uint64_t TotalCallCount = 0;
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (auto CallCount = getProfileCount(&I, nullptr))
+ TotalCallCount += CallCount.getValue();
+ if (!isColdCount(TotalCallCount))
+ return false;
+ }
for (const auto &BB : *F)
- for (const auto &I : BB)
- if (isa<CallInst>(I) || isa<InvokeInst>(I))
- if (auto CallCount = getProfileCount(&I, nullptr))
- TotalCallCount += CallCount.getValue();
- return isColdCount(TotalCallCount);
+ if (!isColdBB(&BB, &BFI))
+ return false;
+ return true;
}
/// Returns true if the function's entry is cold. If it returns false, it
@@ -231,7 +251,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
// If there is no profile for the caller, and we know the profile is
// accurate, we consider the callsite as cold.
return (hasSampleProfile() &&
- (CS.getCaller()->getEntryCount() || ProfileSampleAccurate ||
+ (CS.getCaller()->hasProfileData() || ProfileSampleAccurate ||
CS.getCaller()->hasFnAttribute("profile-sample-accurate")));
}
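Both predicates now require the caller to supply a BlockFrequencyInfo instead of recomputing one internally. A minimal sketch of the updated call pattern; the helper and its name are hypothetical, only the PSI and Function calls are from this patch:

    #include "llvm/Analysis/BlockFrequencyInfo.h"
    #include "llvm/Analysis/ProfileSummaryInfo.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Hypothetical helper mirroring the CodeGenPrepare use further down.
    bool assignSectionPrefix(Function &F, ProfileSummaryInfo &PSI,
                             BlockFrequencyInfo &BFI) {
      if (PSI.isFunctionHotInCallGraph(&F, BFI)) {
        F.setSectionPrefix(".hot");
        return true;
      }
      if (PSI.isFunctionColdInCallGraph(&F, BFI)) {
        F.setSectionPrefix(".unlikely");
        return true;
      }
      return false;
    }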
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 0b8604187121..2a8088dc4452 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -4368,6 +4368,7 @@ static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {
default:
break;
}
+ break;
}
default:
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index b744cae51ed7..c9e9c6d1a419 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -314,6 +314,10 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
return Cost;
}
+bool TargetTransformInfo::isOutOfOrder() const {
+ return TTIImpl->isOutOfOrder();
+}
+
unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
return TTIImpl->getNumberOfRegisters(Vector);
}
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index c9ed026a1e33..173db399b9d6 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -544,21 +544,32 @@ static bool matchAccessTags(const MDNode *A, const MDNode *B,
TBAAStructTagNode TagA(A), TagB(B);
const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(),
TagB.getAccessType());
- if (GenericTag)
- *GenericTag = createAccessTag(CommonType);
// TODO: We need to check if AccessType of TagA encloses AccessType of
// TagB to support aggregate AccessType. If yes, return true.
// Climb the type DAG from base type of A to see if we reach base type of B.
uint64_t OffsetA;
- if (findAccessType(TagA, TagB.getBaseType(), OffsetA))
- return OffsetA == TagB.getOffset();
+ if (findAccessType(TagA, TagB.getBaseType(), OffsetA)) {
+ bool SameMemberAccess = OffsetA == TagB.getOffset();
+ if (GenericTag)
+ *GenericTag = SameMemberAccess ? TagB.getNode() :
+ createAccessTag(CommonType);
+ return SameMemberAccess;
+ }
// Climb the type DAG from base type of B to see if we reach base type of A.
uint64_t OffsetB;
- if (findAccessType(TagB, TagA.getBaseType(), OffsetB))
- return OffsetB == TagA.getOffset();
+ if (findAccessType(TagB, TagA.getBaseType(), OffsetB)) {
+ bool SameMemberAccess = OffsetB == TagA.getOffset();
+ if (GenericTag)
+ *GenericTag = SameMemberAccess ? TagA.getNode() :
+ createAccessTag(CommonType);
+ return SameMemberAccess;
+ }
+
+ if (GenericTag)
+ *GenericTag = createAccessTag(CommonType);
// If the final access types have different roots, they're part of different
// potentially unrelated type systems, so we must be conservative.
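The change is easiest to see on a small aggregate. The C++ below only illustrates the IR-level access tags the function compares; the member offsets are the usual ones for this layout, not something the patch guarantees:

    struct S { int A; float B; };       // A at offset 0, B at offset 4 (typical layout)
    int loadA1(S *P) { return P->A; }   // tag: base "S", access "int", offset 0
    int loadA2(S *P) { return P->A; }   // same member access -> *GenericTag now reuses that tag
    float loadB(S *P) { return P->B; }  // different member -> *GenericTag still falls back to
                                        // createAccessTag(CommonType)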
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index fd13dbc1f1e2..a7201ed97350 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -3371,7 +3371,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
for (auto &RI : FS->refs())
NameVals.push_back(VE.getValueID(RI.getValue()));
- bool HasProfileData = F.getEntryCount().hasValue();
+ bool HasProfileData = F.hasProfileData();
for (auto &ECI : FS->calls()) {
NameVals.push_back(getValueId(ECI.first));
if (HasProfileData)
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 31037095aa2b..d7995447592c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2033,6 +2033,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
}
}
// else fallthrough
+ LLVM_FALLTHROUGH;
// The MC library also has a right-shift operator, but it isn't consistently
// signed or unsigned between different targets.
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 07ba5d36cc96..3aeb4910ab10 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -51,7 +51,7 @@ add_llvm_library(LLVMCodeGen
LiveRangeShrink.cpp
LiveRegMatrix.cpp
LiveRegUnits.cpp
- LiveStackAnalysis.cpp
+ LiveStacks.cpp
LiveVariables.cpp
LLVMTargetMachine.cpp
LocalStackSlotAllocation.cpp
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index c4794380f791..d6f55bba716f 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -352,8 +352,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// Clear per function information.
InsertedInsts.clear();
PromotedInsts.clear();
- BFI.reset();
- BPI.reset();
ModifiedDT = false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
@@ -365,14 +363,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ BPI.reset(new BranchProbabilityInfo(F, *LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
OptSize = F.optForSize();
ProfileSummaryInfo *PSI =
getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
if (ProfileGuidedSectionPrefix) {
- if (PSI->isFunctionHotInCallGraph(&F))
+ if (PSI->isFunctionHotInCallGraph(&F, *BFI))
F.setSectionPrefix(".hot");
- else if (PSI->isFunctionColdInCallGraph(&F))
+ else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
F.setSectionPrefix(".unlikely");
}
@@ -652,13 +652,6 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
if (SameIncomingValueBBs.count(Pred))
return true;
- if (!BFI) {
- Function &F = *BB->getParent();
- LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
- }
-
BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
BlockFrequency BBFreq = BFI->getBlockFreq(BB);
@@ -3704,7 +3697,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
} else {
uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
- ConstantOffset += CI->getSExtValue()*TypeSize;
+ ConstantOffset += CI->getSExtValue() * TypeSize;
} else if (TypeSize) { // Scales of zero don't do anything.
// We only allow one variable index at the moment.
if (VariableOperand != -1)
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 87a658be4c29..a3b43c92a7fc 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -835,6 +835,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
case 64:
ZeroTy = Type::getDoubleTy(Ctx);
break;
+ case 128:
+ ZeroTy = Type::getFP128Ty(Ctx);
+ break;
default:
llvm_unreachable("unexpected floating-point type");
}
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 1aaf7a0ceef8..86ce4b7a9464 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -28,7 +28,7 @@
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 92edfb059ad6..77a7aaa95732 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -81,10 +81,9 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T,
this->OptLevel = OL;
}
-TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(BasicTTIImpl(this, F));
- });
+TargetTransformInfo
+LLVMTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(BasicTTIImpl(this, F));
}
/// addPassesToX helper drives creation and initialization of TargetPassConfig.
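A minimal sketch of the new entry point from a caller's side; the free function is hypothetical, the two queries are existing API shown in this commit:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/Target/TargetMachine.h"
    using namespace llvm;

    unsigned vectorRegisterCount(LLVMTargetMachine &TM, const Function &F) {
      TargetTransformInfo TTI = TM.getTargetTransformInfo(F); // direct, per-function query
      return TTI.getNumberOfRegisters(/*Vector=*/true);
    }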
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStacks.cpp
index b0e58b0e3e5f..80ecfdb7a507 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStacks.cpp
@@ -1,4 +1,4 @@
-//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===//
+//===-- LiveStacks.cpp - Live Stack Slot Analysis -------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,7 +13,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 3568f96d2b9a..f91cca6e4e50 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -157,18 +157,14 @@ public:
void print(const MachineBasicBlock &MBB);
void print(const MachineInstr &MI);
- void printIRBlockReference(const BasicBlock &BB);
void printIRValueReference(const Value &V);
void printStackObjectReference(int FrameIndex);
- void printOffset(int64_t Offset);
void print(const MachineInstr &MI, unsigned OpIdx,
const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies,
LLT TypeToPrint, bool PrintDef = true);
void print(const LLVMContext &Context, const TargetInstrInfo &TII,
const MachineMemOperand &Op);
void printSyncScope(const LLVMContext &Context, SyncScope::ID SSID);
-
- void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI);
};
} // end namespace llvm
@@ -707,32 +703,6 @@ void MIPrinter::print(const MachineInstr &MI) {
}
}
-static void printIRSlotNumber(raw_ostream &OS, int Slot) {
- if (Slot == -1)
- OS << "<badref>";
- else
- OS << Slot;
-}
-
-void MIPrinter::printIRBlockReference(const BasicBlock &BB) {
- OS << "%ir-block.";
- if (BB.hasName()) {
- printLLVMNameWithoutPrefix(OS, BB.getName());
- return;
- }
- const Function *F = BB.getParent();
- int Slot;
- if (F == MST.getCurrentFunction()) {
- Slot = MST.getLocalSlot(&BB);
- } else {
- ModuleSlotTracker CustomMST(F->getParent(),
- /*ShouldInitializeAllMetadata=*/false);
- CustomMST.incorporateFunction(*F);
- Slot = CustomMST.getLocalSlot(&BB);
- }
- printIRSlotNumber(OS, Slot);
-}
-
void MIPrinter::printIRValueReference(const Value &V) {
if (isa<GlobalValue>(V)) {
V.printAsOperand(OS, /*PrintType=*/false, MST);
@@ -750,7 +720,7 @@ void MIPrinter::printIRValueReference(const Value &V) {
printLLVMNameWithoutPrefix(OS, V.getName());
return;
}
- printIRSlotNumber(OS, MST.getLocalSlot(&V));
+ MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V));
}
void MIPrinter::printStackObjectReference(int FrameIndex) {
@@ -762,16 +732,6 @@ void MIPrinter::printStackObjectReference(int FrameIndex) {
Operand.Name);
}
-void MIPrinter::printOffset(int64_t Offset) {
- if (Offset == 0)
- return;
- if (Offset < 0) {
- OS << " - " << -Offset;
- return;
- }
- OS << " + " << Offset;
-}
-
void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
const TargetRegisterInfo *TRI,
bool ShouldPrintRegisterTies, LLT TypeToPrint,
@@ -787,6 +747,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
LLVM_FALLTHROUGH;
case MachineOperand::MO_Register:
case MachineOperand::MO_CImmediate:
+ case MachineOperand::MO_FPImmediate:
case MachineOperand::MO_MachineBasicBlock:
case MachineOperand::MO_ConstantPoolIndex:
case MachineOperand::MO_TargetIndex:
@@ -795,7 +756,11 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_RegisterLiveOut:
case MachineOperand::MO_Metadata:
- case MachineOperand::MO_MCSymbol: {
+ case MachineOperand::MO_MCSymbol:
+ case MachineOperand::MO_CFIIndex:
+ case MachineOperand::MO_IntrinsicID:
+ case MachineOperand::MO_Predicate:
+ case MachineOperand::MO_BlockAddress: {
unsigned TiedOperandIdx = 0;
if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef())
TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx);
@@ -804,21 +769,9 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
TiedOperandIdx, TRI, TII);
break;
}
- case MachineOperand::MO_FPImmediate:
- Op.getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST);
- break;
case MachineOperand::MO_FrameIndex:
printStackObjectReference(Op.getIndex());
break;
- case MachineOperand::MO_BlockAddress:
- OS << "blockaddress(";
- Op.getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false,
- MST);
- OS << ", ";
- printIRBlockReference(*Op.getBlockAddress()->getBasicBlock());
- OS << ')';
- printOffset(Op.getOffset());
- break;
case MachineOperand::MO_RegisterMask: {
auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask());
if (RegMaskInfo != RegisterMaskIds.end())
@@ -827,28 +780,6 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
printCustomRegMask(Op.getRegMask(), OS, TRI);
break;
}
- case MachineOperand::MO_CFIIndex: {
- const MachineFunction &MF = *Op.getParent()->getMF();
- print(MF.getFrameInstructions()[Op.getCFIIndex()], TRI);
- break;
- }
- case MachineOperand::MO_IntrinsicID: {
- Intrinsic::ID ID = Op.getIntrinsicID();
- if (ID < Intrinsic::num_intrinsics)
- OS << "intrinsic(@" << Intrinsic::getName(ID, None) << ')';
- else {
- const MachineFunction &MF = *Op.getParent()->getMF();
- const TargetIntrinsicInfo *TII = MF.getTarget().getIntrinsicInfo();
- OS << "intrinsic(@" << TII->getName(ID) << ')';
- }
- break;
- }
- case MachineOperand::MO_Predicate: {
- auto Pred = static_cast<CmpInst::Predicate>(Op.getPredicate());
- OS << (CmpInst::isIntPredicate(Pred) ? "int" : "float") << "pred("
- << CmpInst::getPredicateName(Pred) << ')';
- break;
- }
}
}
@@ -938,7 +869,7 @@ void MIPrinter::print(const LLVMContext &Context, const TargetInstrInfo &TII,
break;
}
}
- printOffset(Op.getOffset());
+ MachineOperand::printOperandOffset(OS, Op.getOffset());
if (Op.getBaseAlignment() != Op.getSize())
OS << ", align " << Op.getBaseAlignment();
auto AAInfo = Op.getAAInfo();
@@ -978,118 +909,6 @@ void MIPrinter::printSyncScope(const LLVMContext &Context, SyncScope::ID SSID) {
}
}
-static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS,
- const TargetRegisterInfo *TRI) {
- int Reg = TRI->getLLVMRegNum(DwarfReg, true);
- if (Reg == -1) {
- OS << "<badreg>";
- return;
- }
- OS << printReg(Reg, TRI);
-}
-
-void MIPrinter::print(const MCCFIInstruction &CFI,
- const TargetRegisterInfo *TRI) {
- switch (CFI.getOperation()) {
- case MCCFIInstruction::OpSameValue:
- OS << "same_value ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- break;
- case MCCFIInstruction::OpRememberState:
- OS << "remember_state ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- break;
- case MCCFIInstruction::OpRestoreState:
- OS << "restore_state ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- break;
- case MCCFIInstruction::OpOffset:
- OS << "offset ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- OS << ", " << CFI.getOffset();
- break;
- case MCCFIInstruction::OpDefCfaRegister:
- OS << "def_cfa_register ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- break;
- case MCCFIInstruction::OpDefCfaOffset:
- OS << "def_cfa_offset ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- OS << CFI.getOffset();
- break;
- case MCCFIInstruction::OpDefCfa:
- OS << "def_cfa ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- OS << ", " << CFI.getOffset();
- break;
- case MCCFIInstruction::OpRelOffset:
- OS << "rel_offset ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- OS << ", " << CFI.getOffset();
- break;
- case MCCFIInstruction::OpAdjustCfaOffset:
- OS << "adjust_cfa_offset ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- OS << CFI.getOffset();
- break;
- case MCCFIInstruction::OpRestore:
- OS << "restore ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- break;
- case MCCFIInstruction::OpEscape: {
- OS << "escape ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- if (!CFI.getValues().empty()) {
- size_t e = CFI.getValues().size() - 1;
- for (size_t i = 0; i < e; ++i)
- OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", ";
- OS << format("0x%02x", uint8_t(CFI.getValues()[e])) << ", ";
- }
- break;
- }
- case MCCFIInstruction::OpUndefined:
- OS << "undefined ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- break;
- case MCCFIInstruction::OpRegister:
- OS << "register ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- printCFIRegister(CFI.getRegister(), OS, TRI);
- OS << ", ";
- printCFIRegister(CFI.getRegister2(), OS, TRI);
- break;
- case MCCFIInstruction::OpWindowSave:
- OS << "window_save ";
- if (MCSymbol *Label = CFI.getLabel())
- MachineOperand::printSymbol(OS, *Label);
- break;
- default:
- // TODO: Print the other CFI Operations.
- OS << "<unserializable cfi operation>";
- break;
- }
-}
-
void llvm::printMIR(raw_ostream &OS, const Module &M) {
yaml::Output Out(OS);
Out << const_cast<Module &>(M);
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 4ce689607730..84c808ee7938 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -1235,7 +1235,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
// When profile is available, we need to handle the triangle-shape CFG.
static BranchProbability getLayoutSuccessorProbThreshold(
const MachineBasicBlock *BB) {
- if (!BB->getParent()->getFunction().getEntryCount())
+ if (!BB->getParent()->getFunction().hasProfileData())
return BranchProbability(StaticLikelyProb, 100);
if (BB->succ_size() == 2) {
const MachineBasicBlock *Succ1 = *BB->succ_begin();
@@ -2178,7 +2178,7 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
// will be merged into the first outer loop chain for which this block is not
// cold anymore. This needs precise profile data and we only do this when
// profile data is available.
- if (F->getFunction().getEntryCount() || ForceLoopColdBlock) {
+ if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
BlockFrequency LoopFreq(0);
for (auto LoopPred : L.getHeader()->predecessors())
if (!L.contains(LoopPred))
@@ -2220,7 +2220,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
// for better layout.
bool RotateLoopWithProfile =
ForcePreciseRotationCost ||
- (PreciseRotationCost && F->getFunction().getEntryCount());
+ (PreciseRotationCost && F->getFunction().hasProfileData());
// First check to see if there is an obviously preferable top block for the
// loop. This will default to the header, but may end up as one of the
diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp
index d17c481862a1..ec81c6391171 100644
--- a/lib/CodeGen/MachineOperand.cpp
+++ b/lib/CodeGen/MachineOperand.cpp
@@ -380,16 +380,6 @@ static void tryToGetTargetInfo(const MachineOperand &MO,
}
}
-static void printOffset(raw_ostream &OS, int64_t Offset) {
- if (Offset == 0)
- return;
- if (Offset < 0) {
- OS << " - " << -Offset;
- return;
- }
- OS << " + " << Offset;
-}
-
static const char *getTargetIndexName(const MachineFunction &MF, int Index) {
const auto *TII = MF.getSubtarget().getInstrInfo();
assert(TII && "expected instruction info");
@@ -412,6 +402,44 @@ static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) {
return nullptr;
}
+static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ if (!TRI) {
+ OS << "%dwarfreg." << DwarfReg;
+ return;
+ }
+
+ int Reg = TRI->getLLVMRegNum(DwarfReg, true);
+ if (Reg == -1) {
+ OS << "<badreg>";
+ return;
+ }
+ OS << printReg(Reg, TRI);
+}
+
+static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB,
+ ModuleSlotTracker &MST) {
+ OS << "%ir-block.";
+ if (BB.hasName()) {
+ printLLVMNameWithoutPrefix(OS, BB.getName());
+ return;
+ }
+ Optional<int> Slot;
+ if (const Function *F = BB.getParent()) {
+ if (F == MST.getCurrentFunction()) {
+ Slot = MST.getLocalSlot(&BB);
+ } else if (const Module *M = F->getParent()) {
+ ModuleSlotTracker CustomMST(M, /*ShouldInitializeAllMetadata=*/false);
+ CustomMST.incorporateFunction(*F);
+ Slot = CustomMST.getLocalSlot(&BB);
+ }
+ }
+ if (Slot)
+ MachineOperand::printIRSlotNumber(OS, *Slot);
+ else
+ OS << "<unknown>";
+}
+
void MachineOperand::printSubregIdx(raw_ostream &OS, uint64_t Index,
const TargetRegisterInfo *TRI) {
OS << "%subreg.";
@@ -490,6 +518,125 @@ void MachineOperand::printStackObjectReference(raw_ostream &OS,
OS << '.' << Name;
}
+void MachineOperand::printOperandOffset(raw_ostream &OS, int64_t Offset) {
+ if (Offset == 0)
+ return;
+ if (Offset < 0) {
+ OS << " - " << -Offset;
+ return;
+ }
+ OS << " + " << Offset;
+}
+
+void MachineOperand::printIRSlotNumber(raw_ostream &OS, int Slot) {
+ if (Slot == -1)
+ OS << "<badref>";
+ else
+ OS << Slot;
+}
+
+static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI,
+ const TargetRegisterInfo *TRI) {
+ switch (CFI.getOperation()) {
+ case MCCFIInstruction::OpSameValue:
+ OS << "same_value ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpRememberState:
+ OS << "remember_state ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ case MCCFIInstruction::OpRestoreState:
+ OS << "restore_state ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ case MCCFIInstruction::OpOffset:
+ OS << "offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfaRegister:
+ OS << "def_cfa_register ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpDefCfaOffset:
+ OS << "def_cfa_offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ OS << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ OS << "def_cfa ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpRelOffset:
+ OS << "rel_offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ OS << "adjust_cfa_offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ OS << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpRestore:
+ OS << "restore ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpEscape: {
+ OS << "escape ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ if (!CFI.getValues().empty()) {
+ size_t e = CFI.getValues().size() - 1;
+ for (size_t i = 0; i < e; ++i)
+ OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", ";
+ OS << format("0x%02x", uint8_t(CFI.getValues()[e])) << ", ";
+ }
+ break;
+ }
+ case MCCFIInstruction::OpUndefined:
+ OS << "undefined ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpRegister:
+ OS << "register ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", ";
+ printCFIRegister(CFI.getRegister2(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpWindowSave:
+ OS << "window_save ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ default:
+ // TODO: Print the other CFI Operations.
+ OS << "<unserializable cfi directive>";
+ break;
+ }
+}
+
void MachineOperand::print(raw_ostream &OS, const TargetRegisterInfo *TRI,
const TargetIntrinsicInfo *IntrinsicInfo) const {
tryToGetTargetInfo(*this, TRI, IntrinsicInfo);
@@ -561,29 +708,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);
break;
case MachineOperand::MO_FPImmediate:
- if (getFPImm()->getType()->isFloatTy()) {
- OS << getFPImm()->getValueAPF().convertToFloat();
- } else if (getFPImm()->getType()->isHalfTy()) {
- APFloat APF = getFPImm()->getValueAPF();
- bool Unused;
- APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused);
- OS << "half " << APF.convertToFloat();
- } else if (getFPImm()->getType()->isFP128Ty()) {
- APFloat APF = getFPImm()->getValueAPF();
- SmallString<16> Str;
- getFPImm()->getValueAPF().toString(Str);
- OS << "quad " << Str;
- } else if (getFPImm()->getType()->isX86_FP80Ty()) {
- APFloat APF = getFPImm()->getValueAPF();
- OS << "x86_fp80 0xK";
- APInt API = APF.bitcastToAPInt();
- OS << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
- /*Upper=*/true);
- OS << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
- /*Upper=*/true);
- } else {
- OS << getFPImm()->getValueAPF().convertToDouble();
- }
+ getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST);
break;
case MachineOperand::MO_MachineBasicBlock:
OS << printMBBReference(*getMBB());
@@ -606,7 +731,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
case MachineOperand::MO_ConstantPoolIndex:
OS << "%const." << getIndex();
- printOffset(OS, getOffset());
+ printOperandOffset(OS, getOffset());
break;
case MachineOperand::MO_TargetIndex: {
OS << "target-index(";
@@ -615,7 +740,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (const auto *TargetIndexName = getTargetIndexName(*MF, getIndex()))
Name = TargetIndexName;
OS << Name << ')';
- printOffset(OS, getOffset());
+ printOperandOffset(OS, getOffset());
break;
}
case MachineOperand::MO_JumpTableIndex:
@@ -623,7 +748,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
break;
case MachineOperand::MO_GlobalAddress:
getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST);
- printOffset(OS, getOffset());
+ printOperandOffset(OS, getOffset());
break;
case MachineOperand::MO_ExternalSymbol: {
StringRef Name = getSymbolName();
@@ -633,16 +758,19 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
} else {
printLLVMNameWithoutPrefix(OS, Name);
}
- printOffset(OS, getOffset());
+ printOperandOffset(OS, getOffset());
break;
}
- case MachineOperand::MO_BlockAddress:
- OS << '<';
- getBlockAddress()->printAsOperand(OS, /*PrintType=*/false, MST);
- if (getOffset())
- OS << "+" << getOffset();
- OS << '>';
+ case MachineOperand::MO_BlockAddress: {
+ OS << "blockaddress(";
+ getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false,
+ MST);
+ OS << ", ";
+ printIRBlockReference(OS, *getBlockAddress()->getBasicBlock(), MST);
+ OS << ')';
+ MachineOperand::printOperandOffset(OS, getOffset());
break;
+ }
case MachineOperand::MO_RegisterMask: {
OS << "<regmask";
if (TRI) {
@@ -693,23 +821,27 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
case MachineOperand::MO_MCSymbol:
printSymbol(OS, *getMCSymbol());
break;
- case MachineOperand::MO_CFIIndex:
- OS << "<call frame instruction>";
+ case MachineOperand::MO_CFIIndex: {
+ if (const MachineFunction *MF = getMFIfAvailable(*this))
+ printCFI(OS, MF->getFrameInstructions()[getCFIIndex()], TRI);
+ else
+ OS << "<cfi directive>";
break;
+ }
case MachineOperand::MO_IntrinsicID: {
Intrinsic::ID ID = getIntrinsicID();
if (ID < Intrinsic::num_intrinsics)
- OS << "<intrinsic:@" << Intrinsic::getName(ID, None) << '>';
+ OS << "intrinsic(@" << Intrinsic::getName(ID, None) << ')';
else if (IntrinsicInfo)
- OS << "<intrinsic:@" << IntrinsicInfo->getName(ID) << '>';
+ OS << "intrinsic(@" << IntrinsicInfo->getName(ID) << ')';
else
- OS << "<intrinsic:" << ID << '>';
+ OS << "intrinsic(" << ID << ')';
break;
}
case MachineOperand::MO_Predicate: {
auto Pred = static_cast<CmpInst::Predicate>(getPredicate());
- OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred")
- << CmpInst::getPredicateName(Pred) << '>';
+ OS << (CmpInst::isIntPredicate(Pred) ? "int" : "float") << "pred("
+ << CmpInst::getPredicateName(Pred) << ')';
break;
}
}
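Taken together with the MIRPrinter.cpp hunks above, these operands now render in MIR syntax directly from MachineOperand. Judging purely from the format strings in this hunk, the output looks roughly like the following (examples, not taken from a test):

    blockaddress(@callee, %ir-block.entry) + 8    (MO_BlockAddress, with offset)
    intrinsic(@llvm.trap)                         (MO_IntrinsicID)
    intpred(eq), floatpred(olt)                   (MO_Predicate)
    def_cfa_offset 8                              (MO_CFIIndex when the MachineFunction is
                                                   reachable, otherwise "<cfi directive>")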
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index c9fe7681e280..e0cc2ca9a2a2 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -37,7 +37,7 @@
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt
index 2fcbd1280da4..3318e109155b 100644
--- a/lib/CodeGen/README.txt
+++ b/lib/CodeGen/README.txt
@@ -164,7 +164,7 @@ synthesize the various copy insertion/inspection methods in TargetInstrInfo.
Stack coloring improvements:
-1. Do proper LiveStackAnalysis on all stack objects including those which are
+1. Do proper LiveStacks analysis on all stack objects including those which are
not spill slots.
2. Reorder objects to fill in gaps between objects.
e.g. 4, 1, <gap>, 4, 1, 1, 1, <gap>, 4 => 4, 1, 1, 1, 1, 4, 4
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 6e273277804b..1125d2c62bef 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -21,7 +21,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 186ef577e31d..e492c481a540 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -39,7 +39,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 351e91c932eb..69a879701fae 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -45,7 +45,7 @@
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f97732c1c49d..17f907eb07e8 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3988,10 +3988,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
return RAND;
// fold (and (or x, C), D) -> D if (C & D) == D
- if (N1C && N0.getOpcode() == ISD::OR)
- if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1)))
- if (N1C->getAPIntValue().isSubsetOf(ORI->getAPIntValue()))
- return N1;
+ auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
+ return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
+ };
+ if (N0.getOpcode() == ISD::OR &&
+ matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
+ return N1;
// fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N0Op0 = N0.getOperand(0);
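The (and (or x, C), D) -> D fold above is plain bit arithmetic; a standalone check with illustrative constants (C = 0xFF, D = 0x0F, x = 0xAB):

    static_assert(((0xABu | 0xFFu) & 0x0Fu) == 0x0Fu,
                  "(and (or x, C), D) == D when (C & D) == D");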
@@ -4675,16 +4677,16 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
// iff (c1 & c2) != 0.
- if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) {
- if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
- if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) {
- if (SDValue COR =
- DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1))
- return DAG.getNode(
- ISD::AND, SDLoc(N), VT,
- DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
- return SDValue();
- }
+ auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
+ return LHS->getAPIntValue().intersects(RHS->getAPIntValue());
+ };
+ if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+ matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) {
+ if (SDValue COR = DAG.FoldConstantArithmetic(
+ ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) {
+ SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
+ AddToWorklist(IOR.getNode());
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
}
}
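Likewise for the (or (and X, c1), c2) canonicalization; with c1 = 0xF0 and c2 = 0x3C (so c1 & c2 = 0x30 != 0) and X = 0xAB:

    static_assert(((0xABu & 0xF0u) | 0x3Cu) == ((0xABu | 0x3Cu) & (0xF0u | 0x3Cu)),
                  "(or (and X, c1), c2) == (and (or X, c2), c1 | c2)");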
@@ -5380,21 +5382,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
AddToWorklist(NotX.getNode());
return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
}
- // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
- if (N1C && N0.getOpcode() == ISD::XOR) {
- if (const ConstantSDNode *N00C = getAsNonOpaqueConstant(N0.getOperand(0))) {
- SDLoc DL(N);
- return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
- DAG.getConstant(N1C->getAPIntValue() ^
- N00C->getAPIntValue(), DL, VT));
- }
- if (const ConstantSDNode *N01C = getAsNonOpaqueConstant(N0.getOperand(1))) {
- SDLoc DL(N);
- return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
- DAG.getConstant(N1C->getAPIntValue() ^
- N01C->getAPIntValue(), DL, VT));
- }
- }
// fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
unsigned OpSizeInBits = VT.getScalarSizeInBits();
@@ -10201,7 +10188,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
case ISD::SETLT:
case ISD::SETLE:
std::swap(TrueOpnd, FalseOpnd);
- // Fall through
+ LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETUGT:
case ISD::SETOGE:
@@ -10555,7 +10542,7 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
// value in one SSE register, but instruction selection cannot handle
// FCOPYSIGN on SSE registers yet.
EVT N1VT = N1->getValueType(0);
- EVT N1Op0VT = N1->getOperand(0)->getValueType(0);
+ EVT N1Op0VT = N1->getOperand(0).getValueType();
return (N1VT == N1Op0VT || N1Op0VT != MVT::f128);
}
return false;
@@ -13784,30 +13771,30 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
}
- if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
- if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
- !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
- ST->getMemoryVT() == ST1->getMemoryVT()) {
- // If this is a store followed by a store with the same value to the same
- // location, then the store is dead/noop.
- if (ST1->getValue() == Value) {
- // The store is dead, remove it.
- return Chain;
- }
-
- // If this is a store who's preceeding store to the same location
- // and no one other node is chained to that store we can effectively
- // drop the store. Do not remove stores to undef as they may be used as
- // data sinks.
- if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
- !ST1->getBasePtr().isUndef()) {
- // ST1 is fully overwritten and can be elided. Combine with it's chain
- // value.
+ // Deal with elidable overlapping chained stores.
+ if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain))
+ if (OptLevel != CodeGenOpt::None && ST->isUnindexed() &&
+ ST1->isUnindexed() && !ST1->isVolatile() && ST1->hasOneUse() &&
+ !ST1->getBasePtr().isUndef() && !ST->isVolatile()) {
+ BaseIndexOffset STBasePtr = BaseIndexOffset::match(ST->getBasePtr(), DAG);
+ BaseIndexOffset ST1BasePtr =
+ BaseIndexOffset::match(ST1->getBasePtr(), DAG);
+ unsigned STBytes = ST->getMemoryVT().getStoreSize();
+ unsigned ST1Bytes = ST1->getMemoryVT().getStoreSize();
+ int64_t PtrDiff;
+ // If the preceding store writes to a subset of the same memory and no
+ // other node is chained to that store, we can effectively drop it. Do not
+ // remove stores to undef as they may be used as data sinks.
+
+ if (((ST->getBasePtr() == ST1->getBasePtr()) &&
+ (ST->getValue() == ST1->getValue())) ||
+ (STBasePtr.equalBaseIndex(ST1BasePtr, DAG, PtrDiff) &&
+ (0 <= PtrDiff) && (PtrDiff + ST1Bytes <= STBytes))) {
CombineTo(ST1, ST1->getChain());
- return SDValue();
+ return SDValue(N, 0);
}
}
- }
// If this is an FP_ROUND or TRUNC followed by a store, fold this into a
// truncating store. We can do this even if this is already a truncstore.
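A concrete reading of the new containment test in the chained-store elision above (numbers illustrative, not from the patch): if the earlier chained store ST1 writes 4 bytes at base+4 and the later store ST writes 8 bytes at base+0, then PtrDiff is 4, ST1Bytes is 4 and STBytes is 8, so 0 <= PtrDiff and PtrDiff + ST1Bytes <= STBytes both hold; ST1 is entirely overwritten and is combined away to its chain, just as the old same-address, same-value case was.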
@@ -15110,7 +15097,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
// Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
if (In->getOpcode() == ISD::BITCAST &&
- !In->getOperand(0)->getValueType(0).isVector()) {
+ !In->getOperand(0).getValueType().isVector()) {
SDValue Scalar = In->getOperand(0);
// If the bitcast type isn't legal, it might be a trunc of a legal type;
@@ -15157,7 +15144,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
bool FoundMinVT = false;
for (const SDValue &Op : N->ops())
if (ISD::BUILD_VECTOR == Op.getOpcode()) {
- EVT OpSVT = Op.getOperand(0)->getValueType(0);
+ EVT OpSVT = Op.getOperand(0).getValueType();
MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
FoundMinVT = true;
}
@@ -17418,43 +17405,6 @@ SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
return buildSqrtEstimateImpl(Op, Flags, false);
}
-/// Return true if base is a frame index, which is known not to alias with
-/// anything but itself. Provides base object and offset as results.
-static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
- const GlobalValue *&GV, const void *&CV) {
- // Assume it is a primitive operation.
- Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr;
-
- // If it's an adding a simple constant then integrate the offset.
- if (Base.getOpcode() == ISD::ADD) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
- Base = Base.getOperand(0);
- Offset += C->getSExtValue();
- }
- }
-
- // Return the underlying GlobalValue, and update the Offset. Return false
- // for GlobalAddressSDNode since the same GlobalAddress may be represented
- // by multiple nodes with different offsets.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Base)) {
- GV = G->getGlobal();
- Offset += G->getOffset();
- return false;
- }
-
- // Return the underlying Constant value, and update the Offset. Return false
- // for ConstantSDNodes since the same constant pool entry may be represented
- // by multiple nodes with different offsets.
- if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) {
- CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal()
- : (const void *)C->getConstVal();
- Offset += C->getOffset();
- return false;
- }
- // If it's any of the following then it can't alias with anything but itself.
- return isa<FrameIndexSDNode>(Base);
-}
-
/// Return true if there is any possibility that the two addresses overlap.
bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
// If they are the same then they must be aliases.
@@ -17496,39 +17446,18 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
return false;
}
- // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis
- // modified to use BaseIndexOffset.
-
- // Gather base node and offset information.
- SDValue Base0, Base1;
- int64_t Offset0, Offset1;
- const GlobalValue *GV0, *GV1;
- const void *CV0, *CV1;
- bool IsFrameIndex0 = findBaseOffset(Op0->getBasePtr(),
- Base0, Offset0, GV0, CV0);
- bool IsFrameIndex1 = findBaseOffset(Op1->getBasePtr(),
- Base1, Offset1, GV1, CV1);
-
- // If they have the same base address, then check to see if they overlap.
- if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1)))
- return !((Offset0 + NumBytes0) <= Offset1 ||
- (Offset1 + NumBytes1) <= Offset0);
-
- // It is possible for different frame indices to alias each other, mostly
- // when tail call optimization reuses return address slots for arguments.
- // To catch this case, look up the actual index of frame indices to compute
- // the real alias relationship.
- if (IsFrameIndex0 && IsFrameIndex1) {
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- Offset0 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base0)->getIndex());
- Offset1 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex());
- return !((Offset0 + NumBytes0) <= Offset1 ||
- (Offset1 + NumBytes1) <= Offset0);
- }
-
- // Otherwise, if we know what the bases are, and they aren't identical, then
- // we know they cannot alias.
- if ((IsFrameIndex0 || CV0 || GV0) && (IsFrameIndex1 || CV1 || GV1))
+ bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
+ bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
+ bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
+ bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
+ bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
+ bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
+
+ // If the bases are of mismatched kinds, or their indices are checkable,
+ // we can prove that the two accesses do not alias.
+ if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
+ (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
+ (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
return false;
// If we know required SrcValue1 and SrcValue2 have relatively large alignment
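In practice the new kind checks mean, for example (illustrative): an access whose base is a FrameIndex and one whose base is a GlobalAddress have mismatched kinds and both are checkable, so they are reported as non-aliasing without looking at offsets; two bases of the same kind are only disambiguated here when their index operands match, and otherwise fall through to the alignment and alias-analysis checks that follow.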
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index eaf177d0661b..e28a3aa47ca3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1887,7 +1887,7 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) {
SDLoc DL(N);
SDValue Promoted = GetPromotedFloat(Val);
- EVT VT = ST->getOperand(1)->getValueType(0);
+ EVT VT = ST->getOperand(1).getValueType();
EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
SDValue NewVal;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b60d7bca498a..4438ee7878b8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -224,7 +224,7 @@ bool DAGTypeLegalizer::run() {
assert(N->getNodeId() == ReadyToProcess &&
"Node should be ready if on worklist!");
- DEBUG(dbgs() << "Legalizing node: "; N->dump());
+ DEBUG(dbgs() << "Legalizing node: "; N->dump(&DAG));
if (IgnoreNodeResults(N)) {
DEBUG(dbgs() << "Ignoring node results\n");
goto ScanOperands;
@@ -296,7 +296,7 @@ ScanOperands:
continue;
const auto Op = N->getOperand(i);
- DEBUG(dbgs() << "Analyzing operand: "; Op.dump());
+ DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG));
EVT OpVT = Op.getValueType();
switch (getTypeAction(OpVT)) {
case TargetLowering::TypeLegal:
@@ -445,7 +445,7 @@ NodeDone:
if (!isTypeLegal(Node.getValueType(i)) &&
!TLI.isTypeLegal(Node.getValueType(i))) {
dbgs() << "Result type " << i << " illegal: ";
- Node.dump();
+ Node.dump(&DAG);
Failed = true;
}
@@ -455,7 +455,7 @@ NodeDone:
!isTypeLegal(Node.getOperand(i).getValueType()) &&
!TLI.isTypeLegal(Node.getOperand(i).getValueType())) {
dbgs() << "Operand type " << i << " illegal: ";
- Node.getOperand(i).dump();
+ Node.getOperand(i).dump(&DAG);
Failed = true;
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 8f2320f52a0f..ce1c01b621f0 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -331,7 +331,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
// At least try the common case where the boolean is generated by a
// comparison.
if (Cond->getOpcode() == ISD::SETCC) {
- EVT OpVT = Cond->getOperand(0)->getValueType(0);
+ EVT OpVT = Cond->getOperand(0).getValueType();
ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
VecBool = TLI.getBooleanContents(OpVT);
} else
@@ -1548,14 +1548,14 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
- if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0)))
+ if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
Res = SplitVecOp_TruncateHelper(N);
else
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
- if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0)))
+ if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType()))
Res = SplitVecOp_TruncateHelper(N);
else
Res = SplitVecOp_UnaryOp(N);
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 379f0dcef513..7f369c746d24 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -252,6 +252,7 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) {
if (!ResourcesModel->canReserveResources(&TII->get(
SU->getNode()->getMachineOpcode())))
return false;
+ break;
case TargetOpcode::EXTRACT_SUBREG:
case TargetOpcode::INSERT_SUBREG:
case TargetOpcode::SUBREG_TO_REG:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 12a21e74079e..a04c770c51c4 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3750,6 +3750,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::TRUNCATE:
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP:
case ISD::ABS:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index 544da362be69..d5980919d03c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -37,6 +37,23 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
return true;
}
+ // Match Constants
+ if (auto *A = dyn_cast<ConstantPoolSDNode>(Base))
+ if (auto *B = dyn_cast<ConstantPoolSDNode>(Other.Base)) {
+ bool IsMatch =
+ A->isMachineConstantPoolEntry() == B->isMachineConstantPoolEntry();
+ if (IsMatch) {
+ if (A->isMachineConstantPoolEntry())
+ IsMatch = A->getMachineCPVal() == B->getMachineCPVal();
+ else
+ IsMatch = A->getConstVal() == B->getConstVal();
+ }
+ if (IsMatch) {
+ Off += B->getOffset() - A->getOffset();
+ return true;
+ }
+ }
+
const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// Match non-equal FrameIndexes - If both frame indices are fixed
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 18f6997ef83c..d13ccc263718 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -3117,7 +3117,16 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
continue;
}
case OPC_RecordMemRef:
- MatchedMemRefs.push_back(cast<MemSDNode>(N)->getMemOperand());
+ if (auto *MN = dyn_cast<MemSDNode>(N))
+ MatchedMemRefs.push_back(MN->getMemOperand());
+ else {
+ DEBUG(
+ dbgs() << "Expected MemSDNode ";
+ N->dump(CurDAG);
+ dbgs() << '\n'
+ );
+ }
+
continue;
case OPC_CaptureGlueInput:
@@ -3563,7 +3572,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
Ops.push_back(InputGlue);
// Create the node.
- SDNode *Res = nullptr;
+ MachineSDNode *Res = nullptr;
bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo ||
(Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2);
if (!IsMorphNodeTo) {
@@ -3589,7 +3598,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
"Chain node replaced during MorphNode");
Chain.erase(std::remove(Chain.begin(), Chain.end(), N), Chain.end());
});
- Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo);
+ Res = cast<MachineSDNode>(MorphNode(NodeToMatch, TargetOpc, VTList,
+ Ops, EmitNodeInfo));
}
// If the node had chain/glue results, update our notion of the current
@@ -3645,13 +3655,19 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
}
}
- cast<MachineSDNode>(Res)
- ->setMemRefs(MemRefs, MemRefs + NumMemRefs);
+ Res->setMemRefs(MemRefs, MemRefs + NumMemRefs);
}
- DEBUG(dbgs() << " "
- << (IsMorphNodeTo ? "Morphed" : "Created")
- << " node: "; Res->dump(CurDAG); dbgs() << "\n");
+ DEBUG(
+ if (!MatchedMemRefs.empty() && Res->memoperands_empty())
+ dbgs() << " Dropping mem operands\n";
+ dbgs() << " "
+ << (IsMorphNodeTo ? "Morphed" : "Created")
+ << " node: ";
+ Res->dump(CurDAG);
+
+ dbgs() << '\n';
+ );
// If this was a MorphNodeTo then we're completely done!
if (IsMorphNodeTo) {
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 58276052c10b..d76e52d78870 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3812,7 +3812,7 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
DAG.getConstant(EltSize, dl, IdxVT));
- return DAG.getNode(ISD::ADD, dl, IdxVT, Index, VecPtr);
+ return DAG.getNode(ISD::ADD, dl, IdxVT, VecPtr, Index);
}
//===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 62f662d1ade4..8fc7a4a32842 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 543c12eebb45..224ae1a3236a 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -89,6 +89,21 @@ static cl::opt<unsigned> OptsizeJumpTableDensity(
cl::desc("Minimum density for building a jump table in "
"an optsize function"));
+static bool darwinHasSinCos(const Triple &TT) {
+ assert(TT.isOSDarwin() && "should be called with darwin triple");
+ // Don't bother with 32 bit x86.
+ if (TT.getArch() == Triple::x86)
+ return false;
+ // macOS < 10.9 has no sincos_stret.
+ if (TT.isMacOSX())
+ return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit();
+ // iOS < 7.0 has no sincos_stret.
+ if (TT.isiOS())
+ return !TT.isOSVersionLT(7, 0);
+ // Any other darwin such as WatchOS/TvOS is new enough.
+ return true;
+}
+
// Although this default value is arbitrary, it is not random. It is assumed
// that a condition that evaluates the same way by a higher percentage than this
// is best represented as control flow. Therefore, the default value N should be
@@ -100,44 +115,56 @@ static cl::opt<int> MinPercentageForPredictableBranch(
"or false to assume that the condition is predictable"),
cl::Hidden);
-/// InitLibcallNames - Set default libcall names.
-static void InitLibcallNames(const char **Names, const Triple &TT) {
+void TargetLoweringBase::InitLibcalls(const Triple &TT) {
#define HANDLE_LIBCALL(code, name) \
- Names[RTLIB::code] = name;
+ setLibcallName(RTLIB::code, name);
#include "llvm/CodeGen/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
+ // Initialize calling conventions to their default.
+ for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
+ setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
// A few names are different on particular architectures or environments.
if (TT.isOSDarwin()) {
// For f16/f32 conversions, Darwin uses the standard naming scheme, instead
// of the gnueabi-style __gnu_*_ieee.
// FIXME: What about other targets?
- Names[RTLIB::FPEXT_F16_F32] = "__extendhfsf2";
- Names[RTLIB::FPROUND_F32_F16] = "__truncsfhf2";
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+
+ // Darwin 10 and higher has an optimized __bzero.
+ if (!TT.isMacOSX() || !TT.isMacOSXVersionLT(10, 6) || TT.isArch64Bit()) {
+ setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero");
+ }
+
+ if (darwinHasSinCos(TT)) {
+ setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
+ setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
+ if (TT.isWatchABI()) {
+ setLibcallCallingConv(RTLIB::SINCOS_STRET_F32,
+ CallingConv::ARM_AAPCS_VFP);
+ setLibcallCallingConv(RTLIB::SINCOS_STRET_F64,
+ CallingConv::ARM_AAPCS_VFP);
+ }
+ }
} else {
- Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
- Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
}
if (TT.isGNUEnvironment() || TT.isOSFuchsia()) {
- Names[RTLIB::SINCOS_F32] = "sincosf";
- Names[RTLIB::SINCOS_F64] = "sincos";
- Names[RTLIB::SINCOS_F80] = "sincosl";
- Names[RTLIB::SINCOS_F128] = "sincosl";
- Names[RTLIB::SINCOS_PPCF128] = "sincosl";
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ setLibcallName(RTLIB::SINCOS_F80, "sincosl");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosl");
+ setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");
}
if (TT.isOSOpenBSD()) {
- Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = nullptr;
+ setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
}
}
-/// Set default libcall CallingConvs.
-static void InitLibcallCallingConvs(CallingConv::ID *CCs) {
- for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
- CCs[LC] = CallingConv::C;
-}
-
/// getFPEXT - Return the FPEXT_*_* value for the given types, or
/// UNKNOWN_LIBCALL if there is none.
RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
@@ -524,9 +551,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr);
- InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple());
+ InitLibcalls(TM.getTargetTriple());
InitCmpLibcallCCs(CmpLibcallCCs);
- InitLibcallCallingConvs(LibcallCallingConvs);
}
void TargetLoweringBase::initActions() {
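The refactored initializer follows a simple pattern: install defaults for every libcall, then let the triple override individual names and calling conventions in place. A minimal standalone sketch of that pattern (plain C++; the Libcall enum, FakeTriple, and the names below are hypothetical stand-ins, not LLVM's RTLIB or Triple API):

  #include <array>

  enum Libcall { SINCOS_F32, SINCOS_F64, BZERO, NUM_LIBCALLS };
  enum class CallConv { C, ARM_AAPCS_VFP };

  struct FakeTriple { bool IsDarwin, IsWatchABI, IsGNU; };

  struct LibcallTable {
    std::array<const char *, NUM_LIBCALLS> Names{};
    std::array<CallConv, NUM_LIBCALLS> CCs{};

    void setName(Libcall LC, const char *N) { Names[LC] = N; }
    void setCC(Libcall LC, CallConv CC) { CCs[LC] = CC; }

    // Defaults first, then per-triple overrides, mirroring InitLibcalls.
    void init(const FakeTriple &TT) {
      for (int LC = 0; LC < NUM_LIBCALLS; ++LC) {
        Names[LC] = nullptr;
        CCs[LC] = CallConv::C;
      }
      if (TT.IsGNU) {
        setName(SINCOS_F32, "sincosf");
        setName(SINCOS_F64, "sincos");
      }
      if (TT.IsDarwin && TT.IsWatchABI)
        setCC(SINCOS_F32, CallConv::ARM_AAPCS_VFP);
    }
  };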
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 64bb37a280a6..13f7e83f3dd0 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -22,7 +22,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 17f29737bf93..6a6b7fc6fc20 100644
--- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -83,6 +83,7 @@ bool DWARFAcceleratorTable::validateForms() {
!FormValue.isFormClass(DWARFFormValue::FC_Flag)) ||
FormValue.getForm() == dwarf::DW_FORM_sdata)
return false;
+ break;
default:
break;
}
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index a5defa90eb35..eb23ca8229a3 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -88,70 +88,101 @@ static void dumpUUID(raw_ostream &OS, const ObjectFile &Obj) {
}
}
-static void
-dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,
- const DWARFObject &Obj,
- const DWARFSection &StringOffsetsSection,
- StringRef StringSection, bool LittleEndian) {
+using ContributionCollection =
+ std::vector<Optional<StrOffsetsContributionDescriptor>>;
+
+// Collect all the contributions to the string offsets table from all units,
+// sort them by their starting offsets and remove duplicates.
+static ContributionCollection
+collectContributionData(DWARFContext::cu_iterator_range CUs,
+ DWARFContext::tu_section_iterator_range TUSs) {
+ ContributionCollection Contributions;
+ for (const auto &CU : CUs)
+ Contributions.push_back(CU->getStringOffsetsTableContribution());
+ for (const auto &TUS : TUSs)
+ for (const auto &TU : TUS)
+ Contributions.push_back(TU->getStringOffsetsTableContribution());
+
+ // Sort the contributions so that any invalid ones are placed at
+ // the start of the contributions vector. This way they are reported
+ // first.
+ std::sort(Contributions.begin(), Contributions.end(),
+ [](const Optional<StrOffsetsContributionDescriptor> &L,
+ const Optional<StrOffsetsContributionDescriptor> &R) {
+ if (L && R) return L->Base < R->Base;
+ return R.hasValue();
+ });
+
+ // Uniquify contributions, as it is possible that units (specifically
+ // type units in dwo or dwp files) share contributions. We don't want
+ // to report them more than once.
+ Contributions.erase(
+ std::unique(Contributions.begin(), Contributions.end(),
+ [](const Optional<StrOffsetsContributionDescriptor> &L,
+ const Optional<StrOffsetsContributionDescriptor> &R) {
+ if (L && R)
+ return L->Base == R->Base && L->Size == R->Size;
+ return false;
+ }),
+ Contributions.end());
+ return Contributions;
+}
+
+static void dumpDWARFv5StringOffsetsSection(
+ raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj,
+ const DWARFSection &StringOffsetsSection, StringRef StringSection,
+ DWARFContext::cu_iterator_range CUs,
+ DWARFContext::tu_section_iterator_range TUSs, bool LittleEndian) {
+ auto Contributions = collectContributionData(CUs, TUSs);
DWARFDataExtractor StrOffsetExt(Obj, StringOffsetsSection, LittleEndian, 0);
- uint32_t Offset = 0;
+ DataExtractor StrData(StringSection, LittleEndian, 0);
uint64_t SectionSize = StringOffsetsSection.Data.size();
-
- while (Offset < SectionSize) {
- unsigned Version = 0;
- DwarfFormat Format = DWARF32;
- unsigned EntrySize = 4;
- // Perform validation and extract the segment size from the header.
- if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 4)) {
+ uint32_t Offset = 0;
+ for (auto &Contribution : Contributions) {
+ // Report an ill-formed contribution.
+ if (!Contribution) {
OS << "error: invalid contribution to string offsets table in section ."
<< SectionName << ".\n";
return;
}
- uint32_t ContributionStart = Offset;
- uint64_t ContributionSize = StrOffsetExt.getU32(&Offset);
- // A contribution size of 0xffffffff indicates DWARF64, with the actual size
- // in the following 8 bytes. Otherwise, the DWARF standard mandates that
- // the contribution size must be at most 0xfffffff0.
- if (ContributionSize == 0xffffffff) {
- if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 8)) {
- OS << "error: invalid contribution to string offsets table in section ."
- << SectionName << ".\n";
- return;
- }
- Format = DWARF64;
- EntrySize = 8;
- ContributionSize = StrOffsetExt.getU64(&Offset);
- } else if (ContributionSize > 0xfffffff0) {
- OS << "error: invalid contribution to string offsets table in section ."
+
+ dwarf::DwarfFormat Format = Contribution->getFormat();
+ uint16_t Version = Contribution->getVersion();
+ uint64_t ContributionHeader = Contribution->Base;
+ // In DWARF v5 there is a contribution header that immediately precedes
+ // the string offsets base (the location we have previously retrieved from
+ // the CU DIE's DW_AT_str_offsets attribute). The header is located either
+ // 8 or 16 bytes before the base, depending on the contribution's format.
+ if (Version >= 5)
+ ContributionHeader -= Format == DWARF32 ? 8 : 16;
+
+ // Detect overlapping contributions.
+ if (Offset > ContributionHeader) {
+ OS << "error: overlapping contributions to string offsets table in "
+ "section ."
<< SectionName << ".\n";
return;
}
-
- // We must ensure that we don't read a partial record at the end, so we
- // validate for a multiple of EntrySize. Also, we're expecting a version
- // number and padding, which adds an additional 4 bytes.
- uint64_t ValidationSize =
- 4 + ((ContributionSize + EntrySize - 1) & (-(uint64_t)EntrySize));
- if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, ValidationSize)) {
- OS << "error: contribution to string offsets table in section ."
- << SectionName << " has invalid length.\n";
- return;
+ // Report a gap in the table.
+ if (Offset < ContributionHeader) {
+ OS << format("0x%8.8x: Gap, length = ", Offset);
+ OS << (ContributionHeader - Offset) << "\n";
}
-
- Version = StrOffsetExt.getU16(&Offset);
- Offset += 2;
- OS << format("0x%8.8x: ", ContributionStart);
- OS << "Contribution size = " << ContributionSize
+ OS << format("0x%8.8x: ", (uint32_t)ContributionHeader);
+ OS << "Contribution size = " << Contribution->Size
+ << ", Format = " << (Format == DWARF32 ? "DWARF32" : "DWARF64")
<< ", Version = " << Version << "\n";
- uint32_t ContributionBase = Offset;
- DataExtractor StrData(StringSection, LittleEndian, 0);
- while (Offset - ContributionBase < ContributionSize) {
+ Offset = Contribution->Base;
+ unsigned EntrySize = Contribution->getDwarfOffsetByteSize();
+ while (Offset - Contribution->Base < Contribution->Size) {
OS << format("0x%8.8x: ", Offset);
- // FIXME: We can only extract strings in DWARF32 format at the moment.
+ // FIXME: We can only extract strings if the offset fits in 32 bits.
uint64_t StringOffset =
StrOffsetExt.getRelocatedValue(EntrySize, &Offset);
- if (Format == DWARF32) {
+ // Extract the string if we can and display it. Otherwise just report
+ // the offset.
+ if (StringOffset <= std::numeric_limits<uint32_t>::max()) {
uint32_t StringOffset32 = (uint32_t)StringOffset;
OS << format("%8.8x ", StringOffset32);
const char *S = StrData.getCStr(&StringOffset32);
@@ -162,6 +193,11 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,
OS << "\n";
}
}
+ // Report a gap at the end of the table.
+ if (Offset < SectionSize) {
+ OS << format("0x%8.8x: Gap, length = ", Offset);
+ OS << (SectionSize - Offset) << "\n";
+ }
}
// Dump a DWARF string offsets section. This may be a DWARF v5 formatted
@@ -170,17 +206,18 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,
// a header containing size and version number. Alternatively, it may be a
// monolithic series of string offsets, as generated by the pre-DWARF v5
// implementation of split DWARF.
-static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName,
- const DWARFObject &Obj,
- const DWARFSection &StringOffsetsSection,
- StringRef StringSection, bool LittleEndian,
- unsigned MaxVersion) {
+static void dumpStringOffsetsSection(
+ raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj,
+ const DWARFSection &StringOffsetsSection, StringRef StringSection,
+ DWARFContext::cu_iterator_range CUs,
+ DWARFContext::tu_section_iterator_range TUSs, bool LittleEndian,
+ unsigned MaxVersion) {
// If we have at least one (compile or type) unit with DWARF v5 or greater,
// we assume that the section is formatted like a DWARF v5 string offsets
// section.
if (MaxVersion >= 5)
dumpDWARFv5StringOffsetsSection(OS, SectionName, Obj, StringOffsetsSection,
- StringSection, LittleEndian);
+ StringSection, CUs, TUSs, LittleEndian);
else {
DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0);
uint32_t offset = 0;
@@ -468,12 +505,14 @@ void DWARFContext::dump(
DObj->getStringOffsetSection().Data))
dumpStringOffsetsSection(
OS, "debug_str_offsets", *DObj, DObj->getStringOffsetSection(),
- DObj->getStringSection(), isLittleEndian(), getMaxVersion());
+ DObj->getStringSection(), compile_units(), type_unit_sections(),
+ isLittleEndian(), getMaxVersion());
if (shouldDump(ExplicitDWO, ".debug_str_offsets.dwo", DIDT_ID_DebugStrOffsets,
DObj->getStringOffsetDWOSection().Data))
dumpStringOffsetsSection(
OS, "debug_str_offsets.dwo", *DObj, DObj->getStringOffsetDWOSection(),
- DObj->getStringDWOSection(), isLittleEndian(), getMaxVersion());
+ DObj->getStringDWOSection(), dwo_compile_units(),
+ dwo_type_unit_sections(), isLittleEndian(), getMaxVersion());
if (shouldDump(Explicit, ".gnu_index", DIDT_ID_GdbIndex,
DObj->getGdbIndexSection())) {
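The dumper now gathers one contribution descriptor per unit, sorts them so invalid (empty) descriptors are reported first, and drops duplicates shared by type units before walking the section and reporting gaps and overlaps. A minimal sketch of the sort-and-unique step (plain C++17 with std::optional; Contribution is a hypothetical stand-in for StrOffsetsContributionDescriptor):

  #include <algorithm>
  #include <cstdint>
  #include <optional>
  #include <vector>

  struct Contribution { uint64_t Base, Size; };

  void normalize(std::vector<std::optional<Contribution>> &C) {
    // Empty (invalid) entries sort first so they are reported first;
    // valid entries are ordered by their starting offset.
    std::sort(C.begin(), C.end(),
              [](const std::optional<Contribution> &L,
                 const std::optional<Contribution> &R) {
                if (L && R) return L->Base < R->Base;
                return R.has_value();
              });
    // Type units in a .dwo/.dwp may share a contribution; keep one copy.
    C.erase(std::unique(C.begin(), C.end(),
                        [](const std::optional<Contribution> &L,
                           const std::optional<Contribution> &R) {
                          if (L && R)
                            return L->Base == R->Base && L->Size == R->Size;
                          return false;
                        }),
            C.end());
  }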
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index c3d8ff2cbc29..df55d7debf92 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
@@ -79,8 +80,10 @@ bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
uint64_t &Result) const {
- unsigned ItemSize = getDwarfOffsetByteSize();
- uint32_t Offset = StringOffsetSectionBase + Index * ItemSize;
+ if (!StringOffsetsTableContribution)
+ return false;
+ unsigned ItemSize = getDwarfStringOffsetsByteSize();
+ uint32_t Offset = getStringOffsetsBase() + Index * ItemSize;
if (StringOffsetSection.Data.size() < Offset + ItemSize)
return false;
DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection,
@@ -251,15 +254,28 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0);
}
- // In general, we derive the offset of the unit's contibution to the
- // debug_str_offsets{.dwo} section from the unit DIE's
- // DW_AT_str_offsets_base attribute. In dwp files we add to it the offset
- // we get from the index table.
- StringOffsetSectionBase =
- toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);
+ // In general, in DWARF v5 and beyond we derive the start of the unit's
+ // contribution to the string offsets table from the unit DIE's
+ // DW_AT_str_offsets_base attribute. Split DWARF units do not use this
+ // attribute, so we assume that there is a contribution to the string
+ // offsets table starting at offset 0 of the debug_str_offsets.dwo section.
+ // In both cases we need to determine the format of the contribution,
+ // which may differ from the unit's format.
+ uint64_t StringOffsetsContributionBase =
+ isDWO ? 0 : toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);
if (IndexEntry)
if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
- StringOffsetSectionBase += C->Offset;
+ StringOffsetsContributionBase += C->Offset;
+
+ DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection,
+ isLittleEndian, 0);
+ if (isDWO)
+ StringOffsetsTableContribution =
+ determineStringOffsetsTableContributionDWO(
+ DA, StringOffsetsContributionBase);
+ else if (getVersion() >= 5)
+ StringOffsetsTableContribution = determineStringOffsetsTableContribution(
+ DA, StringOffsetsContributionBase);
// Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
// skeleton CU DIE, so that DWARF users not aware of it are not broken.
@@ -344,45 +360,378 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
clearDIEs(true);
}
-void DWARFUnit::updateAddressDieMap(DWARFDie Die) {
- if (Die.isSubroutineDIE()) {
+// Populates a map from PC addresses to subprogram DIEs.
+//
+// This routine tries to look at the smallest amount of the debug info it can
+// to locate the DIEs. This is because many subprograms will never end up being
+// read or needed at all. We want to be as lazy as possible.
+void DWARFUnit::buildSubprogramDIEAddrMap() {
+ assert(SubprogramDIEAddrMap.empty() && "Must only build this map once!");
+ SmallVector<DWARFDie, 16> Worklist;
+ Worklist.push_back(getUnitDIE());
+ do {
+ DWARFDie Die = Worklist.pop_back_val();
+
+ // Queue up child DIEs to recurse through.
+ // FIXME: This causes us to read a lot more debug info than we really need.
+ // We should look at pruning out DIEs which cannot transitively hold
+ // separate subprograms.
+ for (DWARFDie Child : Die.children())
+ Worklist.push_back(Child);
+
+ // If handling a non-subprogram DIE, nothing else to do.
+ if (!Die.isSubprogramDIE())
+ continue;
+
+ // For subprogram DIEs, store them, and insert relevant markers into the
+ // address map. We don't care about overlap at all here as DWARF doesn't
+ // meaningfully support that, so we simply will insert a range with no DIE
+ // starting from the high PC. In the event there are overlaps, sorting
+ // these may truncate things in surprising ways but still will allow
+ // lookups to proceed.
+ int DIEIndex = SubprogramDIEAddrInfos.size();
+ SubprogramDIEAddrInfos.push_back({Die, (uint64_t)-1, {}});
for (const auto &R : Die.getAddressRanges()) {
// Ignore 0-sized ranges.
if (R.LowPC == R.HighPC)
continue;
- auto B = AddrDieMap.upper_bound(R.LowPC);
- if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) {
- // The range is a sub-range of existing ranges, we need to split the
- // existing range.
- if (R.HighPC < B->second.first)
- AddrDieMap[R.HighPC] = B->second;
- if (R.LowPC > B->first)
- AddrDieMap[B->first].first = R.LowPC;
+
+ SubprogramDIEAddrMap.push_back({R.LowPC, DIEIndex});
+ SubprogramDIEAddrMap.push_back({R.HighPC, -1});
+
+ if (R.LowPC < SubprogramDIEAddrInfos.back().SubprogramBasePC)
+ SubprogramDIEAddrInfos.back().SubprogramBasePC = R.LowPC;
+ }
+ } while (!Worklist.empty());
+
+ if (SubprogramDIEAddrMap.empty()) {
+ // If we found no ranges, create a no-op map so that lookups remain simple
+ // but never find anything.
+ SubprogramDIEAddrMap.push_back({0, -1});
+ return;
+ }
+
+ // Next, sort the ranges and remove both exact duplicates and runs with the
+ // same DIE index. We order the ranges so that non-empty ranges are
+ // preferred. Because there may be ties, we also need to use stable sort.
+ std::stable_sort(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(),
+ [](const std::pair<uint64_t, int64_t> &LHS,
+ const std::pair<uint64_t, int64_t> &RHS) {
+ if (LHS.first < RHS.first)
+ return true;
+ if (LHS.first > RHS.first)
+ return false;
+
+ // For ranges that start at the same address, keep the one
+ // with a DIE.
+ if (LHS.second != -1 && RHS.second == -1)
+ return true;
+
+ return false;
+ });
+ SubprogramDIEAddrMap.erase(
+ std::unique(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(),
+ [](const std::pair<uint64_t, int64_t> &LHS,
+ const std::pair<uint64_t, int64_t> &RHS) {
+ // If the start addresses are exactly the same, we can
+ // remove all but the first one as it is the only one that
+ // will be found and used.
+ //
+ // If the DIE indices are the same, we can "merge" the
+ // ranges by eliminating the second.
+ return LHS.first == RHS.first || LHS.second == RHS.second;
+ }),
+ SubprogramDIEAddrMap.end());
+
+ assert(SubprogramDIEAddrMap.back().second == -1 &&
+ "The last interval must not have a DIE as each DIE's address range is "
+ "bounded.");
+}
+
+// Build the second level of mapping from PC to DIE, specifically one that maps
+// a PC *within* a particular DWARF subprogram into a precise, maximally nested
+// inlined subroutine DIE (if any exists). We build a separate map for each
+// subprogram because many subprograms will never get queried for an address
+// and this allows us to be significantly lazier in reading the DWARF itself.
+void DWARFUnit::buildInlinedSubroutineDIEAddrMap(
+ SubprogramDIEAddrInfo &SPInfo) {
+ auto &AddrMap = SPInfo.InlinedSubroutineDIEAddrMap;
+ uint64_t BasePC = SPInfo.SubprogramBasePC;
+
+ auto SubroutineAddrMapSorter = [](const std::pair<int, int> &LHS,
+ const std::pair<int, int> &RHS) {
+ if (LHS.first < RHS.first)
+ return true;
+ if (LHS.first > RHS.first)
+ return false;
+
+ // For ranges that start at the same address, keep the
+ // non-empty one.
+ if (LHS.second != -1 && RHS.second == -1)
+ return true;
+
+ return false;
+ };
+ auto SubroutineAddrMapUniquer = [](const std::pair<int, int> &LHS,
+ const std::pair<int, int> &RHS) {
+ // If the start addresses are exactly the same, we can
+ // remove all but the first one as it is the only one that
+ // will be found and used.
+ //
+ // If the DIE indices are the same, we can "merge" the
+ // ranges by eliminating the second.
+ return LHS.first == RHS.first || LHS.second == RHS.second;
+ };
+
+ struct DieAndParentIntervalRange {
+ DWARFDie Die;
+ int ParentIntervalsBeginIdx, ParentIntervalsEndIdx;
+ };
+
+ SmallVector<DieAndParentIntervalRange, 16> Worklist;
+ auto EnqueueChildDIEs = [&](const DWARFDie &Die, int ParentIntervalsBeginIdx,
+ int ParentIntervalsEndIdx) {
+ for (DWARFDie Child : Die.children())
+ Worklist.push_back(
+ {Child, ParentIntervalsBeginIdx, ParentIntervalsEndIdx});
+ };
+ EnqueueChildDIEs(SPInfo.SubprogramDIE, 0, 0);
+ while (!Worklist.empty()) {
+ DWARFDie Die = Worklist.back().Die;
+ int ParentIntervalsBeginIdx = Worklist.back().ParentIntervalsBeginIdx;
+ int ParentIntervalsEndIdx = Worklist.back().ParentIntervalsEndIdx;
+ Worklist.pop_back();
+
+ // If we encounter a nested subprogram, simply ignore it. We map to
+ // (disjoint) subprograms before arriving here and we don't want to examine
+ // any inlined subroutines of an unrelated subprogram.
+ if (Die.getTag() == DW_TAG_subprogram)
+ continue;
+
+ // For non-subroutines, just recurse to keep searching for inlined
+ // subroutines.
+ if (Die.getTag() != DW_TAG_inlined_subroutine) {
+ EnqueueChildDIEs(Die, ParentIntervalsBeginIdx, ParentIntervalsEndIdx);
+ continue;
+ }
+
+ // Capture the inlined subroutine DIE that we will reference from the map.
+ int DIEIndex = InlinedSubroutineDIEs.size();
+ InlinedSubroutineDIEs.push_back(Die);
+
+ int DieIntervalsBeginIdx = AddrMap.size();
+ // First collect the PC ranges for this DIE into our subroutine interval
+ // map.
+ for (auto R : Die.getAddressRanges()) {
+ // Clamp the PCs to be above the base.
+ R.LowPC = std::max(R.LowPC, BasePC);
+ R.HighPC = std::max(R.HighPC, BasePC);
+ // Compute relative PCs from the subprogram base and drop down to an
+ // unsigned 32-bit int to represent them within the data structure. This
+ // lets us cover a 4gb single subprogram. Because subprograms may be
+ // partitioned into distant parts of a binary (think hot/cold
+ // partitioning) we want to preserve as much as we can here without
+ // burning extra memory. Past that, we will simply truncate and lose the
+ // ability to map those PCs to a DIE more precise than the subprogram.
+ const uint32_t MaxRelativePC = std::numeric_limits<uint32_t>::max();
+ uint32_t RelativeLowPC = (R.LowPC - BasePC) > (uint64_t)MaxRelativePC
+ ? MaxRelativePC
+ : (uint32_t)(R.LowPC - BasePC);
+ uint32_t RelativeHighPC = (R.HighPC - BasePC) > (uint64_t)MaxRelativePC
+ ? MaxRelativePC
+ : (uint32_t)(R.HighPC - BasePC);
+ // Ignore empty or bogus ranges.
+ if (RelativeLowPC >= RelativeHighPC)
+ continue;
+ AddrMap.push_back({RelativeLowPC, DIEIndex});
+ AddrMap.push_back({RelativeHighPC, -1});
+ }
+
+ // If there are no address ranges, there is nothing to do to map into them
+ // and there cannot be any child subroutine DIEs with address ranges of
+ // interest as those would all be required to nest within this DIE's
+ // non-existent ranges, so we can immediately continue to the next DIE in
+ // the worklist.
+ if (DieIntervalsBeginIdx == (int)AddrMap.size())
+ continue;
+
+ // The PCs from this DIE should never overlap, so we can easily sort them
+ // here.
+ std::sort(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(),
+ SubroutineAddrMapSorter);
+ // Remove any dead ranges. These should only come from "empty" ranges that
+ // were clobbered by some other range.
+ AddrMap.erase(std::unique(AddrMap.begin() + DieIntervalsBeginIdx,
+ AddrMap.end(), SubroutineAddrMapUniquer),
+ AddrMap.end());
+
+ // Compute the end index of this DIE's addr map intervals.
+ int DieIntervalsEndIdx = AddrMap.size();
+
+ assert(DieIntervalsBeginIdx != DieIntervalsEndIdx &&
+ "Must not have an empty map for this layer!");
+ assert(AddrMap.back().second == -1 && "Must end with an empty range!");
+ assert(std::is_sorted(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(),
+ less_first()) &&
+ "Failed to sort this DIE's interals!");
+
+ // If we have any parent intervals, walk the newly added ranges and find
+ // the parent ranges they were inserted into. Both of these are sorted and
+ // neither has any overlaps. We need to append new ranges to split up any
+ // parent ranges these new ranges would overlap when we merge them.
+ if (ParentIntervalsBeginIdx != ParentIntervalsEndIdx) {
+ int ParentIntervalIdx = ParentIntervalsBeginIdx;
+ for (int i = DieIntervalsBeginIdx, e = DieIntervalsEndIdx - 1; i < e;
+ ++i) {
+ const uint32_t IntervalStart = AddrMap[i].first;
+ const uint32_t IntervalEnd = AddrMap[i + 1].first;
+ const int IntervalDieIdx = AddrMap[i].second;
+ if (IntervalDieIdx == -1) {
+ // For empty intervals, nothing is required. This is a bit surprising
+ // however. If the prior interval overlaps a parent interval and this
+ // would be necessary to mark the end, we will synthesize a new end
+ // that switches back to the parent DIE below. And this interval will
+ // get dropped in favor of one with a DIE attached. However, we'll
+ // still include this and so worst-case, it will still end the prior
+ // interval.
+ continue;
+ }
+
+ // We are walking the new ranges in order, so search forward from the
+ // last point for a parent range that might overlap.
+ auto ParentIntervalsRange =
+ make_range(AddrMap.begin() + ParentIntervalIdx,
+ AddrMap.begin() + ParentIntervalsEndIdx);
+ assert(std::is_sorted(ParentIntervalsRange.begin(),
+ ParentIntervalsRange.end(), less_first()) &&
+ "Unsorted parent intervals can't be searched!");
+ auto PI = std::upper_bound(
+ ParentIntervalsRange.begin(), ParentIntervalsRange.end(),
+ IntervalStart,
+ [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) {
+ return LHS < RHS.first;
+ });
+ if (PI == ParentIntervalsRange.begin() ||
+ PI == ParentIntervalsRange.end())
+ continue;
+
+ ParentIntervalIdx = PI - AddrMap.begin();
+ int32_t &ParentIntervalDieIdx = std::prev(PI)->second;
+ uint32_t &ParentIntervalStart = std::prev(PI)->first;
+ const uint32_t ParentIntervalEnd = PI->first;
+
+ // If the new range starts exactly at the position of the parent range,
+ // we need to adjust the parent range. Note that these collisions can
+ // only happen with the original parent range because we will merge any
+ // adjacent ranges in the child.
+ if (IntervalStart == ParentIntervalStart) {
+ // If there will be a tail, just shift the start of the parent
+ // forward. Note that this cannot change the parent ordering.
+ if (IntervalEnd < ParentIntervalEnd) {
+ ParentIntervalStart = IntervalEnd;
+ continue;
+ }
+ // Otherwise, mark this as becoming empty so we'll remove it and
+ // prefer the child range.
+ ParentIntervalDieIdx = -1;
+ continue;
+ }
+
+ // Finally, if the parent interval will need to remain as a prefix to
+ // this one, insert a new interval to cover any tail.
+ if (IntervalEnd < ParentIntervalEnd)
+ AddrMap.push_back({IntervalEnd, ParentIntervalDieIdx});
}
- AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);
}
+
+ // Note that we don't need to re-sort even this DIE's address map intervals
+ // after this. All of the newly added intervals actually fill in *gaps* in
+ // this DIE's address map, and we know that children won't need to look up
+ // into those gaps.
+
+ // Recurse through its children, giving them the interval map range of this
+ // DIE to use as their parent intervals.
+ EnqueueChildDIEs(Die, DieIntervalsBeginIdx, DieIntervalsEndIdx);
+ }
+
+ if (AddrMap.empty()) {
+ AddrMap.push_back({0, -1});
+ return;
}
- // Parent DIEs are added to the AddrDieMap prior to the Children DIEs to
- // simplify the logic to update AddrDieMap. The child's range will always
- // be equal or smaller than the parent's range. With this assumption, when
- // adding one range into the map, it will at most split a range into 3
- // sub-ranges.
- for (DWARFDie Child = Die.getFirstChild(); Child; Child = Child.getSibling())
- updateAddressDieMap(Child);
+
+ // Now that we've added all of the intervals needed, we need to resort and
+ // unique them. Most notably, this will remove all the empty ranges that had
+ // a parent range covering, etc. We only expect a single non-empty interval
+ // at any given start point, so we just use std::sort. This could potentially
+ // produce non-deterministic maps for invalid DWARF.
+ std::sort(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapSorter);
+ AddrMap.erase(
+ std::unique(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapUniquer),
+ AddrMap.end());
}
DWARFDie DWARFUnit::getSubroutineForAddress(uint64_t Address) {
extractDIEsIfNeeded(false);
- if (AddrDieMap.empty())
- updateAddressDieMap(getUnitDIE());
- auto R = AddrDieMap.upper_bound(Address);
- if (R == AddrDieMap.begin())
+
+ // We use a two-level mapping structure to locate subroutines for a given PC
+ // address.
+ //
+ // First, we map the address to a subprogram. This can be done more cheaply
+ // because subprograms cannot nest within each other. It also allows us to
+ // avoid detailed examination of many subprograms, instead only focusing on
+ // the ones which we end up actively querying.
+ if (SubprogramDIEAddrMap.empty())
+ buildSubprogramDIEAddrMap();
+
+ assert(!SubprogramDIEAddrMap.empty() &&
+ "We must always end up with a non-empty map!");
+
+ auto I = std::upper_bound(
+ SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), Address,
+ [](uint64_t LHS, const std::pair<uint64_t, int64_t> &RHS) {
+ return LHS < RHS.first;
+ });
+ // If we find the beginning, then the address is before the first subprogram.
+ if (I == SubprogramDIEAddrMap.begin())
return DWARFDie();
- // upper_bound's previous item contains Address.
- --R;
- if (Address >= R->second.first)
+ // Back up to the interval containing the address and see if it
+ // has a DIE associated with it.
+ --I;
+ if (I->second == -1)
return DWARFDie();
- return R->second.second;
+
+ auto &SPInfo = SubprogramDIEAddrInfos[I->second];
+
+ // Now that we have the subprogram for this address, we do the second level
+ // mapping by building a map within a subprogram's PC range to any specific
+ // inlined subroutine.
+ if (SPInfo.InlinedSubroutineDIEAddrMap.empty())
+ buildInlinedSubroutineDIEAddrMap(SPInfo);
+
+ // We look up the inlined subroutine using a subprogram-relative
+ // address.
+ assert(Address >= SPInfo.SubprogramBasePC &&
+ "Address isn't above the start of the subprogram!");
+ uint32_t RelativeAddr = ((Address - SPInfo.SubprogramBasePC) >
+ (uint64_t)std::numeric_limits<uint32_t>::max())
+ ? std::numeric_limits<uint32_t>::max()
+ : (uint32_t)(Address - SPInfo.SubprogramBasePC);
+
+ auto J =
+ std::upper_bound(SPInfo.InlinedSubroutineDIEAddrMap.begin(),
+ SPInfo.InlinedSubroutineDIEAddrMap.end(), RelativeAddr,
+ [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) {
+ return LHS < RHS.first;
+ });
+ // If we find the beginning, the address is before any inlined subroutine so
+ // return the subprogram DIE.
+ if (J == SPInfo.InlinedSubroutineDIEAddrMap.begin())
+ return SPInfo.SubprogramDIE;
+ // Back up `J` and return the inlined subroutine if we have one or the
+ // subprogram if we don't.
+ --J;
+ return J->second == -1 ? SPInfo.SubprogramDIE
+ : InlinedSubroutineDIEs[J->second];
}
void
@@ -466,3 +815,89 @@ const DWARFAbbreviationDeclarationSet *DWARFUnit::getAbbreviations() const {
Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);
return Abbrevs;
}
+
+Optional<StrOffsetsContributionDescriptor>
+StrOffsetsContributionDescriptor::validateContributionSize(
+ DWARFDataExtractor &DA) {
+ uint8_t EntrySize = getDwarfOffsetByteSize();
+ // In order to ensure that we don't read a partial record at the end of
+ // the section we validate for a multiple of the entry size.
+ uint64_t ValidationSize = alignTo(Size, EntrySize);
+ // Guard against overflow.
+ if (ValidationSize >= Size)
+ if (DA.isValidOffsetForDataOfSize((uint32_t)Base, ValidationSize))
+ return *this;
+ return Optional<StrOffsetsContributionDescriptor>();
+}
+
+// Look for a DWARF64-formatted contribution to the string offsets table
+// starting at a given offset and record it in a descriptor.
+static Optional<StrOffsetsContributionDescriptor>
+parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
+ if (!DA.isValidOffsetForDataOfSize(Offset, 16))
+ return Optional<StrOffsetsContributionDescriptor>();
+
+ if (DA.getU32(&Offset) != 0xffffffff)
+ return Optional<StrOffsetsContributionDescriptor>();
+
+ uint64_t Size = DA.getU64(&Offset);
+ uint8_t Version = DA.getU16(&Offset);
+ (void)DA.getU16(&Offset); // padding
+ return StrOffsetsContributionDescriptor(Offset, Size, Version, DWARF64);
+ //return Optional<StrOffsetsContributionDescriptor>(Descriptor);
+}
+
+// Look for a DWARF32-formatted contribution to the string offsets table
+// starting at a given offset and record it in a descriptor.
+static Optional<StrOffsetsContributionDescriptor>
+parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
+ if (!DA.isValidOffsetForDataOfSize(Offset, 8))
+ return Optional<StrOffsetsContributionDescriptor>();
+ uint32_t ContributionSize = DA.getU32(&Offset);
+ if (ContributionSize >= 0xfffffff0)
+ return Optional<StrOffsetsContributionDescriptor>();
+ uint8_t Version = DA.getU16(&Offset);
+ (void)DA.getU16(&Offset); // padding
+ return StrOffsetsContributionDescriptor(Offset, ContributionSize, Version, DWARF32);
+ //return Optional<StrOffsetsContributionDescriptor>(Descriptor);
+}
+
+Optional<StrOffsetsContributionDescriptor>
+DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
+ uint64_t Offset) {
+ Optional<StrOffsetsContributionDescriptor> Descriptor;
+ // Attempt to find a DWARF64 contribution 16 bytes before the base.
+ if (Offset >= 16)
+ Descriptor =
+ parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset - 16);
+ // Try to find a DWARF32 contribution 8 bytes before the base.
+ if (!Descriptor && Offset >= 8)
+ Descriptor = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset - 8);
+ return Descriptor ? Descriptor->validateContributionSize(DA) : Descriptor;
+}
+
+Optional<StrOffsetsContributionDescriptor>
+DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
+ uint64_t Offset) {
+ if (getVersion() >= 5) {
+ // Look for a valid contribution at the given offset.
+ auto Descriptor =
+ parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset);
+ if (!Descriptor)
+ Descriptor = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset);
+ return Descriptor ? Descriptor->validateContributionSize(DA) : Descriptor;
+ }
+ // Prior to DWARF v5, we derive the contribution size from the
+ // index table (in a package file). In a .dwo file it is simply
+ // the length of the string offsets section.
+ uint64_t Size = 0;
+ if (!IndexEntry)
+ Size = StringOffsetSection.Data.size();
+ else if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
+ Size = C->Length;
+ // Return a descriptor with the given offset as base, version 4 and
+ // DWARF32 format.
+ //return Optional<StrOffsetsContributionDescriptor>(
+ //StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32));
+ return StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32);
+}
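The rewritten getSubroutineForAddress performs a two-level lookup: a sorted vector of (start PC, index) pairs locates the subprogram, and a per-subprogram vector of subprogram-relative intervals locates the innermost inlined subroutine. Both levels use the same flattened-interval-map idiom, sketched here in plain C++ (lookup and IntervalMap are illustrative names, not LLVM API):

  #include <algorithm>
  #include <cstdint>
  #include <iterator>
  #include <utility>
  #include <vector>

  // Each element marks the start of an interval and the index of the DIE
  // covering it; -1 marks a gap with no DIE.
  using IntervalMap = std::vector<std::pair<uint64_t, int64_t>>;

  int64_t lookup(const IntervalMap &Map, uint64_t Addr) {
    auto I = std::upper_bound(
        Map.begin(), Map.end(), Addr,
        [](uint64_t LHS, const std::pair<uint64_t, int64_t> &RHS) {
          return LHS < RHS.first;
        });
    if (I == Map.begin())
      return -1;                    // address precedes the first interval
    return std::prev(I)->second;    // interval containing Addr (or a gap)
  }

In the second level the keys are 32-bit offsets from the subprogram's base PC, which keeps the per-subprogram maps compact while still covering 4 GiB of code per subprogram.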
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 34f4017d9828..9c2258f5b933 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Demangle/Demangle.h"
+#include "llvm/Support/Compiler.h"
// This file exports a single function: llvm::itanium_demangle.
// It also has no dependencies on the rest of llvm. It is implemented this way
@@ -1947,7 +1948,7 @@ static const char *parse_type(const char *first, const char *last, C &db) {
break;
}
}
- // falls through
+ LLVM_FALLTHROUGH;
default:
// must check for builtin-types before class-enum-types to avoid
// ambiguities with operator-names
diff --git a/lib/FuzzMutate/IRMutator.cpp b/lib/FuzzMutate/IRMutator.cpp
index 15e7f86d1cdf..00b558ac4dcb 100644
--- a/lib/FuzzMutate/IRMutator.cpp
+++ b/lib/FuzzMutate/IRMutator.cpp
@@ -8,15 +8,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/FuzzMutate/IRMutator.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/FuzzMutate/Operations.h"
#include "llvm/FuzzMutate/Random.h"
#include "llvm/FuzzMutate/RandomIRBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar/DCE.h"
using namespace llvm;
@@ -90,14 +92,14 @@ std::vector<fuzzerop::OpDescriptor> InjectorIRStrategy::getDefaultOps() {
return Ops;
}
-fuzzerop::OpDescriptor
+Optional<fuzzerop::OpDescriptor>
InjectorIRStrategy::chooseOperation(Value *Src, RandomIRBuilder &IB) {
auto OpMatchesPred = [&Src](fuzzerop::OpDescriptor &Op) {
return Op.SourcePreds[0].matches({}, Src);
};
auto RS = makeSampler(IB.Rand, make_filter_range(Operations, OpMatchesPred));
if (RS.isEmpty())
- report_fatal_error("No available operations for src type");
+ return None;
return *RS;
}
@@ -120,10 +122,15 @@ void InjectorIRStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) {
// Choose an operation that's constrained to be valid for the type of the
// source, collect any other sources it needs, and then build it.
- fuzzerop::OpDescriptor OpDesc = chooseOperation(Srcs[0], IB);
- for (const auto &Pred : makeArrayRef(OpDesc.SourcePreds).slice(1))
+ auto OpDesc = chooseOperation(Srcs[0], IB);
+ // Bail if no operation was found
+ if (!OpDesc)
+ return;
+
+ for (const auto &Pred : makeArrayRef(OpDesc->SourcePreds).slice(1))
Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred));
- if (Value *Op = OpDesc.BuilderFunc(Srcs, Insts[IP])) {
+
+ if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP])) {
// Find a sink and wire up the results of the operation.
IB.connectToSink(BB, InstsAfter, Op);
}
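chooseOperation now reports "no candidate" with an empty Optional instead of calling report_fatal_error, and the mutator simply skips the mutation. A sketch of that optional-returning chooser shape (plain C++17; OpDescriptor and chooseMatching are hypothetical, and the real code samples candidates randomly rather than taking the first match):

  #include <optional>
  #include <vector>

  struct OpDescriptor { int Id; };

  std::optional<OpDescriptor>
  chooseMatching(const std::vector<OpDescriptor> &Ops,
                 bool (*Matches)(const OpDescriptor &)) {
    for (const OpDescriptor &Op : Ops)
      if (Matches(Op))
        return Op;          // found a usable operation
    return std::nullopt;    // caller bails out instead of aborting
  }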
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 90b10309b58b..59818a1425f1 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -1674,6 +1674,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
}
}
}
+ break;
}
default:
break;
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 1fff912ecf2f..7063f6f40a30 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -1333,7 +1333,9 @@ Optional<uint64_t> Function::getEntryCount() const {
if (MDS->getString().equals("function_entry_count")) {
ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1));
uint64_t Count = CI->getValue().getZExtValue();
- if (Count == 0)
+ // A value of -1 is used for SamplePGO when there were no samples.
+ // Treat this the same as unknown.
+ if (Count == (uint64_t)-1)
return None;
return Count;
}
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index eae697b2e4b9..163c785f5d76 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -627,9 +627,10 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
CanBeNull = false;
if (const Argument *A = dyn_cast<Argument>(this)) {
DerefBytes = A->getDereferenceableBytes();
- if (DerefBytes == 0 && A->hasByValAttr()) {
+ if (DerefBytes == 0 && (A->hasByValAttr() || A->hasStructRetAttr())) {
Type *PT = cast<PointerType>(A->getType())->getElementType();
- DerefBytes = DL.getTypeStoreSize(PT);
+ if (PT->isSized())
+ DerefBytes = DL.getTypeStoreSize(PT);
}
if (DerefBytes == 0) {
DerefBytes = A->getDereferenceableOrNullBytes();
@@ -655,10 +656,8 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
CanBeNull = true;
}
} else if (auto *AI = dyn_cast<AllocaInst>(this)) {
- const ConstantInt *ArraySize = dyn_cast<ConstantInt>(AI->getArraySize());
- if (ArraySize && AI->getAllocatedType()->isSized()) {
- DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()) *
- ArraySize->getZExtValue();
+ if (!AI->isArrayAllocation()) {
+ DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType());
CanBeNull = false;
}
} else if (auto *GV = dyn_cast<GlobalVariable>(this)) {
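The argument case now treats sret like byval and only consults the pointee's store size when the type is actually sized. A rough sketch of that guard (plain C++17; TypeInfo and derefBytesForArg are hypothetical stand-ins, not the IR API):

  #include <cstdint>
  #include <optional>

  struct TypeInfo { bool Sized; uint64_t StoreSize; };

  std::optional<uint64_t> derefBytesForArg(bool ByVal, bool SRet,
                                           const TypeInfo &Pointee) {
    if (!(ByVal || SRet))
      return std::nullopt;   // no implicit pointee-sized guarantee
    if (!Pointee.Sized)
      return std::nullopt;   // unsized pointee: size unknown
    return Pointee.StoreSize;
  }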
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 3357553cf19f..e521b6e7c704 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -405,9 +405,13 @@ void MCAsmStreamer::emitExplicitComments() {
void MCAsmStreamer::ChangeSection(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
- Section->PrintSwitchToSection(
- *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS,
- Subsection);
+ if (MCTargetStreamer *TS = getTargetStreamer()) {
+ TS->changeSection(getCurrentSectionOnly(), Section, Subsection, OS);
+ } else {
+ Section->PrintSwitchToSection(
+ *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS,
+ Subsection);
+ }
}
void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
@@ -796,10 +800,15 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
"Cannot emit contents before setting section!");
if (Data.empty()) return;
- if (Data.size() == 1) {
- OS << MAI->getData8bitsDirective();
- OS << (unsigned)(unsigned char)Data[0];
- EmitEOL();
+ // If only a single byte is provided, or neither an ascii nor an asciz
+ // directive is supported, emit the data as a sequence of 8-bit values.
+ if (Data.size() == 1 ||
+ !(MAI->getAscizDirective() || MAI->getAsciiDirective())) {
+ const char *Directive = MAI->getData8bitsDirective();
+ for (const unsigned char C : Data.bytes()) {
+ OS << Directive << (unsigned)C;
+ EmitEOL();
+ }
return;
}
@@ -884,8 +893,12 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
assert(Directive && "Invalid size for machine code value!");
OS << Directive;
- Value->print(OS, MAI);
- EmitEOL();
+ if (MCTargetStreamer *TS = getTargetStreamer()) {
+ TS->emitValue(Value);
+ } else {
+ Value->print(OS, MAI);
+ EmitEOL();
+ }
}
void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
@@ -1097,13 +1110,19 @@ unsigned MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo,
}
}
- OS << "\t.file\t" << FileNo << ' ';
+ SmallString<128> Str;
+ raw_svector_ostream OS1(Str);
+ OS1 << "\t.file\t" << FileNo << ' ';
if (!Directory.empty()) {
- PrintQuotedString(Directory, OS);
- OS << ' ';
+ PrintQuotedString(Directory, OS1);
+ OS1 << ' ';
+ }
+ PrintQuotedString(Filename, OS1);
+ if (MCTargetStreamer *TS = getTargetStreamer()) {
+ TS->emitDwarfFileDirective(OS1.str());
+ } else {
+ EmitRawText(OS1.str());
}
- PrintQuotedString(Filename, OS);
- EmitEOL();
return FileNo;
}
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 6f3647d61932..6e801ed8777c 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -49,6 +49,28 @@ void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {}
void MCTargetStreamer::finish() {}
+void MCTargetStreamer::changeSection(const MCSection *CurSection,
+ MCSection *Section,
+ const MCExpr *Subsection,
+ raw_ostream &OS) {
+ Section->PrintSwitchToSection(
+ *Streamer.getContext().getAsmInfo(),
+ Streamer.getContext().getObjectFileInfo()->getTargetTriple(), OS,
+ Subsection);
+}
+
+void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
+ Streamer.EmitRawText(Directive);
+}
+
+void MCTargetStreamer::emitValue(const MCExpr *Value) {
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+
+ Value->print(OS, Streamer.getContext().getAsmInfo());
+ Streamer.EmitRawText(OS.str());
+}
+
void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
MCStreamer::MCStreamer(MCContext &Ctx)
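The asm streamer now routes section changes, values, and .file directives through an MCTargetStreamer hook whose default implementation reproduces the old behavior, so targets can rewrite the emitted text without subclassing the streamer itself. A minimal sketch of that optional-delegate shape (plain C++; TargetStreamer and AsmStreamer are illustrative names only):

  #include <iostream>
  #include <string>

  struct TargetStreamer {
    virtual ~TargetStreamer() = default;
    // Default behavior: emit the directive verbatim.
    virtual void emitFileDirective(const std::string &Line) {
      std::cout << Line << '\n';
    }
  };

  struct AsmStreamer {
    TargetStreamer *TS = nullptr;   // may be absent
    void emitFileDirective(const std::string &Line) {
      if (TS)
        TS->emitFileDirective(Line);   // target gets a chance to rewrite
      else
        std::cout << Line << '\n';
    }
  };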
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index 6e76c5fac35f..0f0b645492ee 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -553,7 +553,7 @@ uint32_t WasmObjectWriter::getRelocationIndexValue(
case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
if (!IndirectSymbolIndices.count(RelEntry.Symbol))
- report_fatal_error("symbol not found table index space: " +
+ report_fatal_error("symbol not found in table index space: " +
RelEntry.Symbol->getName());
return IndirectSymbolIndices[RelEntry.Symbol];
case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
@@ -562,7 +562,7 @@ uint32_t WasmObjectWriter::getRelocationIndexValue(
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
if (!SymbolIndices.count(RelEntry.Symbol))
- report_fatal_error("symbol not found function/global index space: " +
+ report_fatal_error("symbol not found in function/global index space: " +
RelEntry.Symbol->getName());
return SymbolIndices[RelEntry.Symbol];
case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
@@ -994,33 +994,10 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
SmallVector<WasmExport, 4> Exports;
SmallVector<std::pair<StringRef, uint32_t>, 4> SymbolFlags;
SmallVector<std::pair<uint16_t, uint32_t>, 2> InitFuncs;
- SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken;
unsigned NumFuncImports = 0;
SmallVector<WasmDataSegment, 4> DataSegments;
uint32_t DataSize = 0;
- // Populate the IsAddressTaken set.
- for (const WasmRelocationEntry &RelEntry : CodeRelocations) {
- switch (RelEntry.Type) {
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB:
- IsAddressTaken.insert(RelEntry.Symbol);
- break;
- default:
- break;
- }
- }
- for (const WasmRelocationEntry &RelEntry : DataRelocations) {
- switch (RelEntry.Type) {
- case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
- case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
- IsAddressTaken.insert(RelEntry.Symbol);
- break;
- default:
- break;
- }
- }
-
// In the special .global_variables section, we've encoded global
// variables used by the function. Translate them into the Globals
// list.
@@ -1116,7 +1093,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
continue;
// If the symbol is not defined in this translation unit, import it.
- if (!WS.isDefined(/*SetUsed=*/false)) {
+ if (!WS.isDefined(/*SetUsed=*/false) || WS.isVariable()) {
WasmImport Import;
Import.ModuleName = WS.getModuleName();
Import.FieldName = WS.getName();
@@ -1132,8 +1109,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
Import.IsMutable = false;
SymbolIndices[&WS] = NumGlobalImports;
- // If this global is the stack pointer, make it mutable and remember it
- // so that we can emit metadata for it.
+ // If this global is the stack pointer, make it mutable.
if (WS.getName() == "__stack_pointer")
Import.IsMutable = true;
@@ -1218,14 +1194,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
}
DEBUG(dbgs() << " -> function index: " << Index << "\n");
-
- // If needed, prepare the function to be called indirectly.
- if (IsAddressTaken.count(&WS) != 0) {
- IndirectSymbolIndices[&WS] = TableElems.size();
- DEBUG(dbgs() << " -> adding to table: " << TableElems.size() << "\n");
- TableElems.push_back(Index);
- }
- } else {
+ } else {
if (WS.isTemporary() && !WS.getSize())
continue;
@@ -1289,7 +1258,6 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
uint32_t Index = SymbolIndices.find(ResolvedSym)->second;
DEBUG(dbgs() << " -> index:" << Index << "\n");
- SymbolIndices[&WS] = Index;
WasmExport Export;
Export.FieldName = WS.getName();
Export.Index = Index;
@@ -1304,12 +1272,34 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
SymbolFlags.emplace_back(WS.getName(), wasm::WASM_SYMBOL_BINDING_LOCAL);
}
- // Add types for indirect function calls.
- for (const WasmRelocationEntry &Fixup : CodeRelocations) {
- if (Fixup.Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB)
- continue;
+ {
+ auto HandleReloc = [&](const WasmRelocationEntry &Rel) {
+ // Functions referenced by a relocation need to be prepared to be called
+ // indirectly.
+ const MCSymbolWasm& WS = *Rel.Symbol;
+ if (WS.isFunction() && IndirectSymbolIndices.count(&WS) == 0) {
+ switch (Rel.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32:
+ case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: {
+ uint32_t Index = SymbolIndices.find(&WS)->second;
+ IndirectSymbolIndices[&WS] = TableElems.size();
+ DEBUG(dbgs() << " -> adding to table: " << TableElems.size() << "\n");
+ TableElems.push_back(Index);
+ registerFunctionType(WS);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ };
- registerFunctionType(*Fixup.Symbol);
+ for (const WasmRelocationEntry &RelEntry : CodeRelocations)
+ HandleReloc(RelEntry);
+ for (const WasmRelocationEntry &RelEntry : DataRelocations)
+ HandleReloc(RelEntry);
}
// Translate .init_array section contents into start functions.
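Instead of precomputing an IsAddressTaken set, the writer now walks both relocation lists once and adds any address-taken function to the indirect-call table on first sight. A sketch of that deduplicating walk (plain C++; Reloc and buildIndirectTable are hypothetical, standing in for WasmRelocationEntry handling):

  #include <string>
  #include <unordered_map>
  #include <vector>

  struct Reloc { std::string Sym; bool TakesAddress; };

  void buildIndirectTable(const std::vector<Reloc> &Code,
                          const std::vector<Reloc> &Data,
                          std::unordered_map<std::string, unsigned> &Table) {
    auto Handle = [&](const Reloc &R) {
      if (!R.TakesAddress || Table.count(R.Sym))
        return;                        // not address-taken, or already added
      unsigned Slot = (unsigned)Table.size();
      Table.emplace(R.Sym, Slot);      // one table slot per function
    };
    for (const Reloc &R : Code) Handle(R);
    for (const Reloc &R : Data) Handle(R);
  }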
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index c72a1258c1ee..5906dc5f5307 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -138,6 +138,7 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
default:
break;
}
+ break;
case ELF::EM_BPF:
switch (Type) {
#include "llvm/BinaryFormat/ELFRelocs/BPF.def"
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 7a0c05ed8a15..48f98df6f34d 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -303,7 +303,6 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) {
void WasmObjectFile::populateSymbolTable() {
// Add imports to symbol table
- size_t ImportIndex = 0;
size_t GlobalIndex = 0;
size_t FunctionIndex = 0;
for (const wasm::WasmImport& Import : Imports) {
@@ -312,7 +311,7 @@ void WasmObjectFile::populateSymbolTable() {
assert(Import.Global.Type == wasm::WASM_TYPE_I32);
SymbolMap.try_emplace(Import.Field, Symbols.size());
Symbols.emplace_back(Import.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT,
- ImportSection, GlobalIndex++, ImportIndex);
+ ImportSection, GlobalIndex++);
DEBUG(dbgs() << "Adding import: " << Symbols.back()
<< " sym index:" << Symbols.size() << "\n");
break;
@@ -320,14 +319,13 @@ void WasmObjectFile::populateSymbolTable() {
SymbolMap.try_emplace(Import.Field, Symbols.size());
Symbols.emplace_back(Import.Field,
WasmSymbol::SymbolType::FUNCTION_IMPORT,
- ImportSection, FunctionIndex++, ImportIndex);
+ ImportSection, FunctionIndex++, Import.SigIndex);
DEBUG(dbgs() << "Adding import: " << Symbols.back()
<< " sym index:" << Symbols.size() << "\n");
break;
default:
break;
}
- ImportIndex++;
}
// Add exports to symbol table
@@ -338,11 +336,22 @@ void WasmObjectFile::populateSymbolTable() {
Export.Kind == wasm::WASM_EXTERNAL_FUNCTION
? WasmSymbol::SymbolType::FUNCTION_EXPORT
: WasmSymbol::SymbolType::GLOBAL_EXPORT;
- SymbolMap.try_emplace(Export.Name, Symbols.size());
- Symbols.emplace_back(Export.Name, ExportType,
- ExportSection, Export.Index);
- DEBUG(dbgs() << "Adding export: " << Symbols.back()
- << " sym index:" << Symbols.size() << "\n");
+ auto Pair = SymbolMap.try_emplace(Export.Name, Symbols.size());
+ if (Pair.second) {
+ Symbols.emplace_back(Export.Name, ExportType,
+ ExportSection, Export.Index);
+ DEBUG(dbgs() << "Adding export: " << Symbols.back()
+ << " sym index:" << Symbols.size() << "\n");
+ } else {
+ uint32_t SymIndex = Pair.first->second;
+ const WasmSymbol &OldSym = Symbols[SymIndex];
+ WasmSymbol NewSym(Export.Name, ExportType, ExportSection, Export.Index);
+ NewSym.setAltIndex(OldSym.ElementIndex);
+ Symbols[SymIndex] = NewSym;
+
+ DEBUG(dbgs() << "Replacing existing symbol: " << NewSym
+ << " sym index:" << SymIndex << "\n");
+ }
}
}
}
@@ -1017,7 +1026,7 @@ void WasmObjectFile::getRelocationTypeName(
break;
switch (Rel.Type) {
-#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def"
+#include "llvm/BinaryFormat/WasmRelocs.def"
}
#undef WASM_RELOC
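populateSymbolTable now uses try_emplace so an export that collides with an existing name replaces the earlier symbol while remembering its element index as an alternate. A sketch of that insert-or-replace pattern (plain C++17; Sym and addOrReplace are illustrative, not the WasmSymbol API):

  #include <string>
  #include <unordered_map>
  #include <vector>

  struct Sym { std::string Name; unsigned Index; unsigned AltIndex = ~0u; };

  void addOrReplace(std::vector<Sym> &Syms,
                    std::unordered_map<std::string, unsigned> &Map,
                    const Sym &S) {
    auto Pair = Map.try_emplace(S.Name, (unsigned)Syms.size());
    if (Pair.second) {                          // first occurrence of the name
      Syms.push_back(S);
      return;
    }
    unsigned Slot = Pair.first->second;
    Sym Replacement = S;
    Replacement.AltIndex = Syms[Slot].Index;    // remember the shadowed index
    Syms[Slot] = Replacement;
  }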
diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp
index 9ca584a4a1ae..271224ec6312 100644
--- a/lib/Object/WindowsResource.cpp
+++ b/lib/Object/WindowsResource.cpp
@@ -14,6 +14,7 @@
#include "llvm/Object/WindowsResource.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <ctime>
#include <queue>
@@ -560,10 +561,9 @@ void WindowsResourceCOFFWriter::writeSymbolTable() {
// Now write a symbol for each relocation.
for (unsigned i = 0; i < Data.size(); i++) {
- char RelocationName[9];
- sprintf(RelocationName, "$R%06X", DataOffsets[i]);
+ auto RelocationName = formatv("$R{0:X-6}", i & 0xffffff).sstr<COFF::NameSize>();
Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset);
- strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize);
+ memcpy(Symbol->Name.ShortName, RelocationName.data(), (size_t) COFF::NameSize);
Symbol->Value = DataOffsets[i];
Symbol->SectionNumber = 2;
Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL;
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index 8687f22949a2..b2411395dc0f 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -439,7 +439,7 @@ void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(
void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration(
IO &IO, WasmYAML::RelocType &Type) {
#define WASM_RELOC(name, value) IO.enumCase(Type, #name, wasm::name);
-#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def"
+#include "llvm/BinaryFormat/WasmRelocs.def"
#undef WASM_RELOC
}
diff --git a/lib/Passes/LLVMBuild.txt b/lib/Passes/LLVMBuild.txt
index 4d8c7f85d3aa..e2378a84328e 100644
--- a/lib/Passes/LLVMBuild.txt
+++ b/lib/Passes/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = Passes
parent = Libraries
-required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation
+required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index f7fb0cef16bf..3489feb93a02 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -2546,12 +2546,12 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {
}
bool IEEEFloat::convertFromStringSpecials(StringRef str) {
- if (str.equals("inf") || str.equals("INFINITY")) {
+ if (str.equals("inf") || str.equals("INFINITY") || str.equals("+Inf")) {
makeInf(false);
return true;
}
- if (str.equals("-inf") || str.equals("-INFINITY")) {
+ if (str.equals("-inf") || str.equals("-INFINITY") || str.equals("-Inf")) {
makeInf(true);
return true;
}
diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp
index 3e97c991f504..141573c2a1c7 100644
--- a/lib/Support/CachePruning.cpp
+++ b/lib/Support/CachePruning.cpp
@@ -165,12 +165,14 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
return false;
}
} else {
+ if (!Policy.Interval)
+ return false;
if (Policy.Interval != seconds(0)) {
// Check whether the time stamp is older than our pruning interval.
// If not, do nothing.
const auto TimeStampModTime = FileStatus.getLastModificationTime();
auto TimeStampAge = CurrentTime - TimeStampModTime;
- if (TimeStampAge <= Policy.Interval) {
+ if (TimeStampAge <= *Policy.Interval) {
DEBUG(dbgs() << "Timestamp file too recent ("
<< duration_cast<seconds>(TimeStampAge).count()
<< "s old), do not prune.\n");
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 85e782b2c048..c709fc416df6 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -80,10 +80,12 @@ void *operator new(size_t N, const NamedBufferAlloc &Alloc) {
namespace {
/// MemoryBufferMem - Named MemoryBuffer pointing to a block of memory.
-class MemoryBufferMem : public MemoryBuffer {
+template<typename MB>
+class MemoryBufferMem : public MB {
public:
MemoryBufferMem(StringRef InputData, bool RequiresNullTerminator) {
- init(InputData.begin(), InputData.end(), RequiresNullTerminator);
+ MemoryBuffer::init(InputData.begin(), InputData.end(),
+ RequiresNullTerminator);
}
/// Disable sized deallocation for MemoryBufferMem, because it has
@@ -95,21 +97,22 @@ public:
return StringRef(reinterpret_cast<const char *>(this + 1));
}
- BufferKind getBufferKind() const override {
- return MemoryBuffer_Malloc;
+ MemoryBuffer::BufferKind getBufferKind() const override {
+ return MemoryBuffer::MemoryBuffer_Malloc;
}
};
}
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
-getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
+template <typename MB>
+static ErrorOr<std::unique_ptr<MB>>
+getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile);
std::unique_ptr<MemoryBuffer>
MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,
bool RequiresNullTerminator) {
auto *Ret = new (NamedBufferAlloc(BufferName))
- MemoryBufferMem(InputData, RequiresNullTerminator);
+ MemoryBufferMem<MemoryBuffer>(InputData, RequiresNullTerminator);
return std::unique_ptr<MemoryBuffer>(Ret);
}
@@ -119,50 +122,30 @@ MemoryBuffer::getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator) {
Ref.getBuffer(), Ref.getBufferIdentifier(), RequiresNullTerminator));
}
-std::unique_ptr<MemoryBuffer>
-MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) {
- std::unique_ptr<MemoryBuffer> Buf =
- getNewUninitMemBuffer(InputData.size(), BufferName);
+static ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
+getMemBufferCopyImpl(StringRef InputData, const Twine &BufferName) {
+ auto Buf = WritableMemoryBuffer::getNewUninitMemBuffer(InputData.size(), BufferName);
if (!Buf)
- return nullptr;
- memcpy(const_cast<char*>(Buf->getBufferStart()), InputData.data(),
- InputData.size());
- return Buf;
+ return make_error_code(errc::not_enough_memory);
+ memcpy(Buf->getBufferStart(), InputData.data(), InputData.size());
+ return std::move(Buf);
}
std::unique_ptr<MemoryBuffer>
-MemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) {
- // Allocate space for the MemoryBuffer, the data and the name. It is important
- // that MemoryBuffer and data are aligned so PointerIntPair works with them.
- // TODO: Is 16-byte alignment enough? We copy small object files with large
- // alignment expectations into this buffer.
- SmallString<256> NameBuf;
- StringRef NameRef = BufferName.toStringRef(NameBuf);
- size_t AlignedStringLen =
- alignTo(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16);
- size_t RealLen = AlignedStringLen + Size + 1;
- char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow));
- if (!Mem)
- return nullptr;
-
- // The name is stored after the class itself.
- CopyStringRef(Mem + sizeof(MemoryBufferMem), NameRef);
-
- // The buffer begins after the name and must be aligned.
- char *Buf = Mem + AlignedStringLen;
- Buf[Size] = 0; // Null terminate buffer.
-
- auto *Ret = new (Mem) MemoryBufferMem(StringRef(Buf, Size), true);
- return std::unique_ptr<MemoryBuffer>(Ret);
+MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) {
+ auto Buf = getMemBufferCopyImpl(InputData, BufferName);
+ if (Buf)
+ return std::move(*Buf);
+ return nullptr;
}
std::unique_ptr<MemoryBuffer>
MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
- std::unique_ptr<MemoryBuffer> SB = getNewUninitMemBuffer(Size, BufferName);
+ auto SB = WritableMemoryBuffer::getNewUninitMemBuffer(Size, BufferName);
if (!SB)
return nullptr;
- memset(const_cast<char*>(SB->getBufferStart()), 0, Size);
- return SB;
+ memset(SB->getBufferStart(), 0, Size);
+ return std::move(SB);
}
ErrorOr<std::unique_ptr<MemoryBuffer>>
@@ -179,10 +162,10 @@ MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,
uint64_t Offset, bool IsVolatile) {
- return getFileAux(FilePath, -1, MapSize, Offset, false, IsVolatile);
+ return getFileAux<MemoryBuffer>(FilePath, -1, MapSize, Offset, false,
+ IsVolatile);
}
-
//===----------------------------------------------------------------------===//
// MemoryBuffer::getFile implementation.
//===----------------------------------------------------------------------===//
@@ -191,7 +174,8 @@ namespace {
/// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.
///
/// This handles converting the offset into a legal offset on the platform.
-class MemoryBufferMMapFile : public MemoryBuffer {
+template<typename MB>
+class MemoryBufferMMapFile : public MB {
sys::fs::mapped_file_region MFR;
static uint64_t getLegalMapOffset(uint64_t Offset) {
@@ -209,11 +193,13 @@ class MemoryBufferMMapFile : public MemoryBuffer {
public:
MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,
uint64_t Offset, std::error_code &EC)
- : MFR(FD, sys::fs::mapped_file_region::readonly,
+ : MFR(FD,
+ MB::Writable ? sys::fs::mapped_file_region::priv
+ : sys::fs::mapped_file_region::readonly,
getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {
if (!EC) {
const char *Start = getStart(Len, Offset);
- init(Start, Start + Len, RequiresNullTerminator);
+ MemoryBuffer::init(Start, Start + Len, RequiresNullTerminator);
}
}
@@ -226,13 +212,13 @@ public:
return StringRef(reinterpret_cast<const char *>(this + 1));
}
- BufferKind getBufferKind() const override {
- return MemoryBuffer_MMap;
+ MemoryBuffer::BufferKind getBufferKind() const override {
+ return MemoryBuffer::MemoryBuffer_MMap;
}
};
}
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
+static ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
getMemoryBufferForStream(int FD, const Twine &BufferName) {
const ssize_t ChunkSize = 4096*4;
SmallString<ChunkSize> Buffer;
@@ -246,37 +232,80 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {
Buffer.set_size(Buffer.size() + ReadBytes);
} while (ReadBytes != 0);
- return MemoryBuffer::getMemBufferCopy(Buffer, BufferName);
+ return getMemBufferCopyImpl(Buffer, BufferName);
}
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
bool RequiresNullTerminator, bool IsVolatile) {
- return getFileAux(Filename, FileSize, FileSize, 0,
- RequiresNullTerminator, IsVolatile);
+ return getFileAux<MemoryBuffer>(Filename, FileSize, FileSize, 0,
+ RequiresNullTerminator, IsVolatile);
}
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
+template <typename MB>
+static ErrorOr<std::unique_ptr<MB>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
bool IsVolatile);
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
+template <typename MB>
+static ErrorOr<std::unique_ptr<MB>>
getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) {
int FD;
std::error_code EC = sys::fs::openFileForRead(Filename, FD);
+
if (EC)
return EC;
- ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
- getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset,
- RequiresNullTerminator, IsVolatile);
+ auto Ret = getOpenFileImpl<MB>(FD, Filename, FileSize, MapSize, Offset,
+ RequiresNullTerminator, IsVolatile);
close(FD);
return Ret;
}
+ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
+WritableMemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
+ bool IsVolatile) {
+ return getFileAux<WritableMemoryBuffer>(Filename, FileSize, FileSize, 0,
+ /*RequiresNullTerminator*/ false,
+ IsVolatile);
+}
+
+ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
+WritableMemoryBuffer::getFileSlice(const Twine &Filename, uint64_t MapSize,
+ uint64_t Offset, bool IsVolatile) {
+ return getFileAux<WritableMemoryBuffer>(Filename, -1, MapSize, Offset, false,
+ IsVolatile);
+}
+
+std::unique_ptr<WritableMemoryBuffer>
+WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) {
+ using MemBuffer = MemoryBufferMem<WritableMemoryBuffer>;
+ // Allocate space for the MemoryBuffer, the data and the name. It is important
+ // that MemoryBuffer and data are aligned so PointerIntPair works with them.
+ // TODO: Is 16-byte alignment enough? We copy small object files with large
+ // alignment expectations into this buffer.
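+ //
+ // Rough layout of the single allocation (illustrative sketch):
+ //   [ MemBuffer object | name + '\0' | padding to 16 bytes | buffer data | '\0' ]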
+ SmallString<256> NameBuf;
+ StringRef NameRef = BufferName.toStringRef(NameBuf);
+ size_t AlignedStringLen = alignTo(sizeof(MemBuffer) + NameRef.size() + 1, 16);
+ size_t RealLen = AlignedStringLen + Size + 1;
+ char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow));
+ if (!Mem)
+ return nullptr;
+
+ // The name is stored after the class itself.
+ CopyStringRef(Mem + sizeof(MemBuffer), NameRef);
+
+ // The buffer begins after the name and must be aligned.
+ char *Buf = Mem + AlignedStringLen;
+ Buf[Size] = 0; // Null terminate buffer.
+
+ auto *Ret = new (Mem) MemBuffer(StringRef(Buf, Size), true);
+ return std::unique_ptr<WritableMemoryBuffer>(Ret);
+}
+
static bool shouldUseMmap(int FD,
size_t FileSize,
size_t MapSize,
@@ -332,7 +361,8 @@ static bool shouldUseMmap(int FD,
return true;
}
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
+template <typename MB>
+static ErrorOr<std::unique_ptr<MB>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
bool IsVolatile) {
@@ -364,22 +394,21 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
PageSize, IsVolatile)) {
std::error_code EC;
- std::unique_ptr<MemoryBuffer> Result(
- new (NamedBufferAlloc(Filename))
- MemoryBufferMMapFile(RequiresNullTerminator, FD, MapSize, Offset, EC));
+ std::unique_ptr<MB> Result(
+ new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>(
+ RequiresNullTerminator, FD, MapSize, Offset, EC));
if (!EC)
return std::move(Result);
}
- std::unique_ptr<MemoryBuffer> Buf =
- MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
+ auto Buf = WritableMemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
if (!Buf) {
// Failed to create a buffer. The only way it can fail is if
// new(std::nothrow) returns 0.
return make_error_code(errc::not_enough_memory);
}
- char *BufPtr = const_cast<char *>(Buf->getBufferStart());
+ char *BufPtr = Buf.get()->getBufferStart();
size_t BytesLeft = MapSize;
#ifndef HAVE_PREAD
@@ -412,7 +441,7 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
bool RequiresNullTerminator, bool IsVolatile) {
- return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
+ return getOpenFileImpl<MemoryBuffer>(FD, Filename, FileSize, FileSize, 0,
RequiresNullTerminator, IsVolatile);
}
@@ -420,7 +449,8 @@ ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
int64_t Offset, bool IsVolatile) {
assert(MapSize != uint64_t(-1));
- return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatile);
+ return getOpenFileImpl<MemoryBuffer>(FD, Filename, -1, MapSize, Offset, false,
+ IsVolatile);
}
ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index 90992fce0bcc..9ba7a09f9962 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -586,7 +586,7 @@ bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
APFloat::opStatus Status =
F.convertFromString(*this, APFloat::rmNearestTiesToEven);
if (Status != APFloat::opOK) {
- if (!AllowInexact || Status != APFloat::opInexact)
+ if (!AllowInexact || !(Status & APFloat::opInexact))
return true;
}
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index c59068cb3550..b96ca084e9bf 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -537,7 +537,7 @@ StringRef llvm::AArch64::getDefaultCPU(StringRef Arch) {
}
unsigned llvm::AArch64::checkArchVersion(StringRef Arch) {
- if (Arch[0] == 'v' && std::isdigit(Arch[1]))
+ if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1]))
return (Arch[1] - 48);
return 0;
}
@@ -633,7 +633,7 @@ StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) {
// Only match non-marketing names
if (offset != StringRef::npos) {
// Must start with 'vN'.
- if (A[0] != 'v' || !std::isdigit(A[1]))
+ if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1])))
return Error;
// Can't have an extra 'eb'.
if (A.find("eb") != StringRef::npos)
@@ -739,7 +739,6 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
case ARM::ArchKind::ARMV8_2A:
case ARM::ArchKind::ARMV8_3A:
return ARM::ProfileKind::A;
- LLVM_FALLTHROUGH;
case ARM::ArchKind::ARMV2:
case ARM::ArchKind::ARMV2A:
case ARM::ArchKind::ARMV3:
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 05ca40f03018..f8a80ba87873 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -657,7 +657,12 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
}
i = j + 1;
} else if (MustQuote == QuotingType::Double &&
- !sys::unicode::isPrintable(S[j])) {
+ !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
+ // If we're double quoting non-printable characters, we prefer printing
+ // them as "\x" + their hex representation. Note that special casing is
+ // needed for UTF-8, where a byte may be part of a UTF-8 sequence and
+ // appear as non-printable, in which case we want to print the correct
+ // Unicode character and not its hex representation.
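+ // For example, a control character such as 0x07 is emitted as "\x07", while
+ // bytes with the high bit set (potential UTF-8 sequences) are passed through.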
output(StringRef(&Base[i], j - i)); // "flush"
output(StringLiteral("\\x"));
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 67138f41dda8..2ff2ee347f56 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -583,6 +583,20 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
+ case AArch64::MOVIv2d_ns:
+ // If the target has <rdar://problem/16473581>, lower this
+ // instruction to movi.16b instead.
+ if (STI->hasZeroCycleZeroingFPWorkaround() &&
+ MI->getOperand(1).getImm() == 0) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(AArch64::MOVIv16b_ns);
+ TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
+ EmitToStreamer(*OutStreamer, TmpInst);
+ return;
+ }
+ break;
+
case AArch64::DBG_VALUE: {
if (isVerbose() && OutStreamer->hasRawTextSupport()) {
SmallString<128> TmpStr;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index fd1699fd363d..022200986d2b 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5135,11 +5135,12 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
}
- // fall-back to target-independent instruction selection.
- return selectOperator(I, I->getOpcode());
// Silence warnings.
(void)&CC_AArch64_DarwinPCS_VarArg;
(void)&CC_AArch64_Win64_VarArg;
+
+ // Fall back to target-independent instruction selection.
+ return selectOperator(I, I->getOpcode());
}
namespace llvm {
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 73944359223a..d66f7b59a4b5 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -97,6 +97,7 @@
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -335,6 +336,22 @@ bool AArch64FrameLowering::canUseAsPrologue(
return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
}
+static bool windowsRequiresStackProbe(MachineFunction &MF,
+ unsigned StackSizeInBytes) {
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isTargetWindows())
+ return false;
+ const Function &F = MF.getFunction();
+ // TODO: When implementing stack protectors, take that into account
+ // for the probe threshold.
+ unsigned StackProbeSize = 4096;
+ if (F.hasFnAttribute("stack-probe-size"))
+ F.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
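+ // For example, with the IR attribute "stack-probe-size"="8192" on the
+ // function, frames smaller than 8192 bytes skip the probe.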
+ return StackSizeInBytes >= StackProbeSize;
+}
+
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
MachineFunction &MF, unsigned StackBumpBytes) const {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -347,7 +364,7 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
// 512 is the maximum immediate for stp/ldp that will be used for
// callee-save save/restores
- if (StackBumpBytes >= 512)
+ if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
return false;
if (MFI.hasVarSizedObjects())
@@ -478,7 +495,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
return;
int NumBytes = (int)MFI.getStackSize();
- if (!AFI->hasStackFrame()) {
+ if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
// All of the stack allocation is for locals.
@@ -550,6 +567,44 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup);
}
+ if (windowsRequiresStackProbe(MF, NumBytes)) {
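+ // __chkstk expects the allocation size in 16-byte units in X15 (hence
+ // NumBytes >> 4); the actual SP adjustment (X15 << 4) is done by the
+ // SUBXrx64 below.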
+ uint32_t NumWords = NumBytes >> 4;
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ switch (MF.getTarget().getCodeModel()) {
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Kernel:
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addExternalSymbol("__chkstk")
+ .addReg(AArch64::X15, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ case CodeModel::Large:
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
+ .addReg(AArch64::X16, RegState::Define)
+ .addExternalSymbol("__chkstk")
+ .addExternalSymbol("__chkstk")
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
+ .addReg(AArch64::X16, RegState::Kill)
+ .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+ .setMIFlags(MachineInstr::FrameSetup);
+ break;
+ }
+
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+ .addReg(AArch64::SP, RegState::Kill)
+ .addReg(AArch64::X15, RegState::Kill)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
+ .setMIFlags(MachineInstr::FrameSetup);
+ NumBytes = 0;
+ }
+
// Allocate space for the rest of the frame.
if (NumBytes) {
const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
@@ -1164,18 +1219,32 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+ unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
+ ? RegInfo->getBaseRegister()
+ : (unsigned)AArch64::NoRegister;
+
+ unsigned SpillEstimate = SavedRegs.count();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ unsigned PairedReg = CSRegs[i ^ 1];
+ if (Reg == BasePointerReg)
+ SpillEstimate++;
+ if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg))
+ SpillEstimate++;
+ }
+ SpillEstimate += 2; // Conservatively include FP+LR in the estimate
+ unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate;
+
// The frame record needs to be created by saving the appropriate registers
- if (hasFP(MF)) {
+ if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) {
SavedRegs.set(AArch64::FP);
SavedRegs.set(AArch64::LR);
}
- unsigned BasePointerReg = AArch64::NoRegister;
- if (RegInfo->hasBasePointer(MF))
- BasePointerReg = RegInfo->getBaseRegister();
-
unsigned ExtraCSSpill = 0;
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
const unsigned Reg = CSRegs[i];
@@ -1217,7 +1286,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
- MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1242cf5be188..6f7b2b6fd5b5 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -470,10 +470,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- if (Subtarget->isTargetMachO()) {
- // For iOS, we don't want to the normal expansion of a libcall to
- // sincos. We want to issue a libcall to __sincos_stret to avoid memory
- // traffic.
+ if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+ getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+ // Issue __sincos_stret if available.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
} else {
@@ -2328,8 +2327,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Entry.IsZExt = false;
Args.push_back(Entry);
- const char *LibcallName =
- (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+ RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
+ : RTLIB::SINCOS_STRET_F32;
+ const char *LibcallName = getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index c7c560a81328..abbba7d1d5a9 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4963,16 +4963,9 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
MachineBasicBlock &MBB, MachineFunction &MF,
const MachineOutlinerInfo &MInfo) const {
- bool ContainsCalls = false;
-
- for (MachineInstr &MI : MBB) {
- if (MI.isCall()) {
- ContainsCalls = true;
- break;
- }
- }
-
- if (ContainsCalls) {
+ // Is there a call in the outlined range?
+ if (std::any_of(MBB.instr_begin(), MBB.instr_end(),
+ [](MachineInstr &MI) { return MI.isCall(); })) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
fixupPostOutline(MBB);
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 7f5507371fa0..a719d47618e5 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -25,11 +25,11 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
const AArch64Subtarget &STI =
DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
- const char *bzeroEntry =
- (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr;
+ const char *bzeroName = (V && V->isNullValue())
+ ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) : nullptr;
// For small size (< 256), it is not beneficial to use bzero
// instead of memset.
- if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+ if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) {
const AArch64TargetLowering &TLI = *STI.getTargetLowering();
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
@@ -45,7 +45,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
CLI.setDebugLoc(dl)
.setChain(Chain)
.setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ DAG.getExternalSymbol(bzeroName, IntPtr),
std::move(Args))
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index e397d585ae77..688bb936d0ca 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -217,19 +217,6 @@ unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
return AArch64II::MO_NO_FLAG;
}
-/// This function returns the name of a function which has an interface
-/// like the non-standard bzero function, if such a function exists on
-/// the current subtarget and it is considered prefereable over
-/// memset with zero passed as the second argument. Otherwise it
-/// returns null.
-const char *AArch64Subtarget::getBZeroEntry() const {
- // Prefer bzero on Darwin only.
- if(isTargetDarwin())
- return "bzero";
-
- return nullptr;
-}
-
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
// LNT run (at least on Cyclone) showed reasonably significant gains for
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 5d9759d363dd..9245b2f396b7 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -309,13 +309,6 @@ public:
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
const TargetMachine &TM) const;
- /// This function returns the name of a function which has an interface
- /// like the non-standard bzero function, if such a function exists on
- /// the current subtarget and it is considered prefereable over
- /// memset with zero passed as the second argument. Otherwise it
- /// returns null.
- const char *getBZeroEntry() const;
-
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index df939add70fa..66b7e02ceb99 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -322,6 +322,9 @@ def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR2_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b010> {
+ let Requires = [{ {AArch64::HasV8_3aOps} }];
+}
def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>;
def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>;
def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 64583ead73f2..0e6ad944c141 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -346,10 +346,9 @@ public:
} // end anonymous namespace
-TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(AArch64TTIImpl(this, F));
- });
+TargetTransformInfo
+AArch64TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(AArch64TTIImpl(this, F));
}
TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 2bbfb2da3db6..8d28a5e30ebf 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -44,8 +44,7 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- /// \brief Get the TargetIRAnalysis for this target.
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
TargetLoweringObjectFile* getObjFileLowering() const override {
return TLOF.get();
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index aeffbd70fc81..6e63783e5646 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1975,10 +1975,6 @@ static bool isValidSVEKind(StringRef Name) {
.Default(false);
}
-static bool isSVERegister(StringRef Name) {
- return Name[0] == 'z' || Name[0] == 'p';
-}
-
static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
char &ElementKind) {
assert(isValidVectorKind(Name));
@@ -2008,21 +2004,19 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
// Matches a register name or register alias previously defined by '.req'
unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
RegKind Kind) {
- unsigned RegNum;
- switch (Kind) {
- case RegKind::Scalar:
- RegNum = MatchRegisterName(Name);
- break;
- case RegKind::NeonVector:
- RegNum = MatchNeonVectorRegName(Name);
- break;
- case RegKind::SVEDataVector:
- RegNum = matchSVEDataVectorRegName(Name);
- break;
- case RegKind::SVEPredicateVector:
- RegNum = matchSVEPredicateVectorRegName(Name);
- break;
- }
+ unsigned RegNum = 0;
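+ // Try each register-name matcher in turn; if the name belongs to a different
+ // register kind than the one requested, reject it by returning 0.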
+ if ((RegNum = matchSVEDataVectorRegName(Name)))
+ return Kind == RegKind::SVEDataVector ? RegNum : 0;
+
+ if ((RegNum = matchSVEPredicateVectorRegName(Name)))
+ return Kind == RegKind::SVEPredicateVector ? RegNum : 0;
+
+ if ((RegNum = MatchNeonVectorRegName(Name)))
+ return Kind == RegKind::NeonVector ? RegNum : 0;
+
+ // The parsed register must be of RegKind::Scalar.
+ if ((RegNum = MatchRegisterName(Name)))
+ return Kind == RegKind::Scalar ? RegNum : 0;
if (!RegNum) {
// Check for aliases registered via .req. Canonicalize to lower case.
@@ -2049,10 +2043,8 @@ int AArch64AsmParser::tryParseRegister() {
return -1;
std::string lowerCase = Tok.getString().lower();
- if (isSVERegister(lowerCase))
- return -1;
-
unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
+
// Also handle a few aliases of registers.
if (RegNum == 0)
RegNum = StringSwitch<unsigned>(lowerCase)
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index bb628b8c558f..fda6252f46e3 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -695,18 +695,24 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+ assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+ "trap handler registers should not be used");
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3f8a9b1964ca..5c31bddd9b1a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -202,6 +202,16 @@ public:
const char* getTargetNodeName(unsigned Opcode) const override;
+ // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection
+ // for AMDGPU.
+ // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
+ // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on
+ // MergeConsecutiveStores() before Instruction Selection for all targets.
+ // Enough AMDGPU compiles go into an infinite loop (MergeConsecutiveStores()
+ // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores()
+ // re-merges, etc.) to warrant turning it off for now.
+ bool mergeStoresAfterLegalization() const override { return false; }
+
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6984f4e71613..2042dbf6d5e2 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -571,10 +571,9 @@ public:
} // end anonymous namespace
-TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(AMDGPUTTIImpl(this, F));
- });
+TargetTransformInfo
+AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
}
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
@@ -898,4 +897,3 @@ void GCNPassConfig::addPreEmitPass() {
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(*this, PM);
}
-
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 5043e31f6f5b..5f9b2a7fca20 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -55,7 +55,7 @@ public:
const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
return &IntrinsicInfo;
}
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2acd7f78faea..ebf656c549ec 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -536,6 +536,10 @@ public:
return EndLoc;
}
+ SMRange getLocRange() const {
+ return SMRange(StartLoc, EndLoc);
+ }
+
Modifiers getModifiers() const {
assert(isRegKind() || isImmTy(ImmTyNone));
return isRegKind() ? Reg.Mods : Imm.Mods;
@@ -1491,6 +1495,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 1: return AMDGPU::TTMP_32RegClassID;
case 2: return AMDGPU::TTMP_64RegClassID;
case 4: return AMDGPU::TTMP_128RegClassID;
+ case 8: return AMDGPU::TTMP_256RegClassID;
+ case 16: return AMDGPU::TTMP_512RegClassID;
}
} else if (Is == IS_SGPR) {
switch (RegWidth) {
@@ -1498,8 +1504,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 1: return AMDGPU::SGPR_32RegClassID;
case 2: return AMDGPU::SGPR_64RegClassID;
case 4: return AMDGPU::SGPR_128RegClassID;
- case 8: return AMDGPU::SReg_256RegClassID;
- case 16: return AMDGPU::SReg_512RegClassID;
+ case 8: return AMDGPU::SGPR_256RegClassID;
+ case 16: return AMDGPU::SGPR_512RegClassID;
}
}
return -1;
@@ -1754,6 +1760,11 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
// TODO: add syntactic sugar for 1/(2*PI)
bool Minus = false;
if (getLexer().getKind() == AsmToken::Minus) {
+ const AsmToken NextToken = getLexer().peekTok();
+ if (!NextToken.is(AsmToken::Integer) &&
+ !NextToken.is(AsmToken::Real)) {
+ return MatchOperand_NoMatch;
+ }
Minus = true;
Parser.Lex();
}
@@ -1783,7 +1794,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
return MatchOperand_Success;
}
default:
- return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch;
+ return MatchOperand_NoMatch;
}
}
@@ -2244,6 +2255,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
return true;
}
+static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS,
+ unsigned VariantID = 0);
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -2286,8 +2300,13 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_MissingFeature:
return Error(IDLoc, "instruction not supported on this GPU");
- case Match_MnemonicFail:
- return Error(IDLoc, "unrecognized instruction mnemonic");
+ case Match_MnemonicFail: {
+ uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ std::string Suggestion = AMDGPUMnemonicSpellCheck(
+ ((AMDGPUOperand &)*Operands[0]).getToken(), FBS);
+ return Error(IDLoc, "invalid instruction" + Suggestion,
+ ((AMDGPUOperand &)*Operands[0]).getLocRange());
+ }
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
@@ -3838,7 +3857,9 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
return Ok? MatchOperand_Success : MatchOperand_ParseFail;
} else {
- return MatchOperand_NoMatch;
+ // Swizzle "offset" operand is optional.
+ // If it is omitted, try parsing other optional operands.
+ return parseOptionalOperand(Operands);
}
}
@@ -4786,6 +4807,7 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() {
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
+#define GET_MNEMONIC_SPELL_CHECKER
#include "AMDGPUGenAsmMatcher.inc"
// This function should be defined after auto-generated include so that we have
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4a3f2c975179..47a2d3f2fdc5 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -348,10 +348,12 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
case AMDGPU::TTMP_128RegClassID:
// ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
// this bundle?
- case AMDGPU::SReg_256RegClassID:
- // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+ case AMDGPU::SGPR_256RegClassID:
+ case AMDGPU::TTMP_256RegClassID:
+ // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
// this bundle?
- case AMDGPU::SReg_512RegClassID:
+ case AMDGPU::SGPR_512RegClassID:
+ case AMDGPU::TTMP_512RegClassID:
shift = 2;
break;
// ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
@@ -441,11 +443,11 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
- return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+ return decodeDstOp(OPW256, Val);
}
MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
- return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+ return decodeDstOp(OPW512, Val);
}
MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
@@ -593,6 +595,8 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
return SGPR_32RegClassID;
case OPW64: return SGPR_64RegClassID;
case OPW128: return SGPR_128RegClassID;
+ case OPW256: return SGPR_256RegClassID;
+ case OPW512: return SGPR_512RegClassID;
}
}
@@ -608,6 +612,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
return TTMP_32RegClassID;
case OPW64: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
+ case OPW256: return TTMP_256RegClassID;
+ case OPW512: return TTMP_512RegClassID;
}
}
@@ -659,6 +665,25 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
}
}
+MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) const {
+ using namespace AMDGPU::EncValues;
+
+ assert(Val < 128);
+ assert(Width == OPW256 || Width == OPW512);
+
+ if (Val <= SGPR_MAX) {
+ assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+ return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+ }
+
+ int TTmpIdx = getTTmpIdx(Val);
+ if (TTmpIdx >= 0) {
+ return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
+ }
+
+ llvm_unreachable("unknown dst register");
+}
+
MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
using namespace AMDGPU;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index ce396eb68c4c..75cfc5e11282 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -95,6 +95,8 @@ public:
OPW32,
OPW64,
OPW128,
+ OPW256,
+ OPW512,
OPW16,
OPWV216,
OPW_LAST_,
@@ -110,6 +112,7 @@ public:
MCOperand decodeLiteralConstant() const;
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 67663d39967c..bf57f88bef91 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -335,13 +335,13 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
} else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 8;
- } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) {
+ } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 8;
} else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
O << 'v';
NumRegs = 16;
- } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) {
+ } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) {
O << 's';
NumRegs = 16;
} else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index 6b7c3ffb7bb8..dd0efef7f91b 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -8,6 +8,26 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+class getSubRegs<int size> {
+ list<SubRegIndex> ret2 = [sub0, sub1];
+ list<SubRegIndex> ret3 = [sub0, sub1, sub2];
+ list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
+ list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
+ list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
+ sub4, sub5, sub6, sub7,
+ sub8, sub9, sub10, sub11,
+ sub12, sub13, sub14, sub15];
+
+ list<SubRegIndex> ret = !if(!eq(size, 2), ret2,
+ !if(!eq(size, 3), ret3,
+ !if(!eq(size, 4), ret4,
+ !if(!eq(size, 8), ret8, ret16))));
+}
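+// For example, getSubRegs<4>.ret evaluates to [sub0, sub1, sub2, sub3].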
+
+//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
@@ -141,19 +161,19 @@ def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
}
// SGPR 64-bit registers
-def SGPR_64Regs : RegisterTuples<[sub0, sub1],
+def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
[(add (decimate SGPR_32, 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
// SGPR 128-bit registers
-def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
[(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4))]>;
// SGPR 256-bit registers
-def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
[(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
@@ -164,8 +184,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
(add (decimate (shl SGPR_32, 7), 4))]>;
// SGPR 512-bit registers
-def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
- sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
[(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
@@ -190,47 +209,125 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
}
// Trap handler TMP 64-bit registers
-def TTMP_64Regs : RegisterTuples<[sub0, sub1],
+def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret,
[(add (decimate TTMP_32, 2)),
(add (decimate (shl TTMP_32, 1), 2))]>;
// Trap handler TMP 128-bit registers
-def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret,
[(add (decimate TTMP_32, 4)),
(add (decimate (shl TTMP_32, 1), 4)),
(add (decimate (shl TTMP_32, 2), 4)),
(add (decimate (shl TTMP_32, 3), 4))]>;
-class TmpRegTuples <string tgt,
- bit Is64Bit,
- int Index0,
- int Index1 = !add(Index0, 1),
- int Index2 = !add(Index0, !if(Is64Bit, 1, 2)),
- int Index3 = !add(Index0, !if(Is64Bit, 1, 3)),
- string name = "ttmp["#Index0#":"#Index3#"]",
- Register r0 = !cast<Register>("TTMP"#Index0#tgt),
- Register r1 = !cast<Register>("TTMP"#Index1#tgt),
- Register r2 = !cast<Register>("TTMP"#Index2#tgt),
- Register r3 = !cast<Register>("TTMP"#Index3#tgt)> :
- RegisterWithSubRegs<name, !if(Is64Bit, [r0, r1], [r0, r1, r2, r3])> {
- let SubRegIndices = !if(Is64Bit, [sub0, sub1], [sub0, sub1, sub2, sub3]);
- let HWEncoding = r0.HWEncoding;
-}
+def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret,
+ [(add (decimate TTMP_32, 4)),
+ (add (decimate (shl TTMP_32, 1), 4)),
+ (add (decimate (shl TTMP_32, 2), 4)),
+ (add (decimate (shl TTMP_32, 3), 4)),
+ (add (decimate (shl TTMP_32, 4), 4)),
+ (add (decimate (shl TTMP_32, 5), 4)),
+ (add (decimate (shl TTMP_32, 6), 4)),
+ (add (decimate (shl TTMP_32, 7), 4))]>;
+
+def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret,
+ [(add (decimate TTMP_32, 4)),
+ (add (decimate (shl TTMP_32, 1), 4)),
+ (add (decimate (shl TTMP_32, 2), 4)),
+ (add (decimate (shl TTMP_32, 3), 4)),
+ (add (decimate (shl TTMP_32, 4), 4)),
+ (add (decimate (shl TTMP_32, 5), 4)),
+ (add (decimate (shl TTMP_32, 6), 4)),
+ (add (decimate (shl TTMP_32, 7), 4)),
+ (add (decimate (shl TTMP_32, 8), 4)),
+ (add (decimate (shl TTMP_32, 9), 4)),
+ (add (decimate (shl TTMP_32, 10), 4)),
+ (add (decimate (shl TTMP_32, 11), 4)),
+ (add (decimate (shl TTMP_32, 12), 4)),
+ (add (decimate (shl TTMP_32, 13), 4)),
+ (add (decimate (shl TTMP_32, 14), 4)),
+ (add (decimate (shl TTMP_32, 15), 4))]>;
+
+class TmpRegTuplesBase<int index, int size,
+ list<Register> subRegs,
+ list<SubRegIndex> indices = getSubRegs<size>.ret,
+ int index1 = !add(index, !add(size, -1)),
+ string name = "ttmp["#index#":"#index1#"]"> :
+ RegisterWithSubRegs<name, subRegs> {
+ let HWEncoding = subRegs[0].HWEncoding;
+ let SubRegIndices = indices;
+}
+
+class TmpRegTuples<string tgt,
+ int size,
+ int index0,
+ int index1 = !add(index0, 1),
+ int index2 = !add(index0, !if(!eq(size, 2), 1, 2)),
+ int index3 = !add(index0, !if(!eq(size, 2), 1, 3)),
+ int index4 = !add(index0, !if(!eq(size, 8), 4, 1)),
+ int index5 = !add(index0, !if(!eq(size, 8), 5, 1)),
+ int index6 = !add(index0, !if(!eq(size, 8), 6, 1)),
+ int index7 = !add(index0, !if(!eq(size, 8), 7, 1)),
+ Register r0 = !cast<Register>("TTMP"#index0#tgt),
+ Register r1 = !cast<Register>("TTMP"#index1#tgt),
+ Register r2 = !cast<Register>("TTMP"#index2#tgt),
+ Register r3 = !cast<Register>("TTMP"#index3#tgt),
+ Register r4 = !cast<Register>("TTMP"#index4#tgt),
+ Register r5 = !cast<Register>("TTMP"#index5#tgt),
+ Register r6 = !cast<Register>("TTMP"#index6#tgt),
+ Register r7 = !cast<Register>("TTMP"#index7#tgt)> :
+ TmpRegTuplesBase<index0, size,
+ !if(!eq(size, 2), [r0, r1],
+ !if(!eq(size, 4), [r0, r1, r2, r3],
+ [r0, r1, r2, r3, r4, r5, r6, r7])),
+ getSubRegs<size>.ret>;
foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
- def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 1, Index>;
- def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 1, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>;
}
foreach Index = {0, 4, 8, 12} in {
def TTMP#Index#_TTMP#!add(Index,1)#
_TTMP#!add(Index,2)#
- _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 0, Index>;
+ _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>;
def TTMP#Index#_TTMP#!add(Index,1)#
_TTMP#!add(Index,2)#
- _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 0, Index>;
+ _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>;
}
+foreach Index = {0, 4, 8} in {
+ def TTMP#Index#_TTMP#!add(Index,1)#
+ _TTMP#!add(Index,2)#
+ _TTMP#!add(Index,3)#
+ _TTMP#!add(Index,4)#
+ _TTMP#!add(Index,5)#
+ _TTMP#!add(Index,6)#
+ _TTMP#!add(Index,7)#_vi : TmpRegTuples<"_vi", 8, Index>;
+ def TTMP#Index#_TTMP#!add(Index,1)#
+ _TTMP#!add(Index,2)#
+ _TTMP#!add(Index,3)#
+ _TTMP#!add(Index,4)#
+ _TTMP#!add(Index,5)#
+ _TTMP#!add(Index,6)#
+ _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>;
+}
+
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi :
+ TmpRegTuplesBase<0, 16,
+ [TTMP0_vi, TTMP1_vi, TTMP2_vi, TTMP3_vi,
+ TTMP4_vi, TTMP5_vi, TTMP6_vi, TTMP7_vi,
+ TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,
+ TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>;
+
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 :
+ TmpRegTuplesBase<0, 16,
+ [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9,
+ TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9,
+ TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9,
+ TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>;
+
+
// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -240,25 +337,25 @@ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
}
// VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<[sub0, sub1],
+def VGPR_64 : RegisterTuples<getSubRegs<2>.ret,
[(add (trunc VGPR_32, 255)),
(add (shl VGPR_32, 1))]>;
// VGPR 96-bit registers
-def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+def VGPR_96 : RegisterTuples<getSubRegs<3>.ret,
[(add (trunc VGPR_32, 254)),
(add (shl VGPR_32, 1)),
(add (shl VGPR_32, 2))]>;
// VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
[(add (trunc VGPR_32, 253)),
(add (shl VGPR_32, 1)),
(add (shl VGPR_32, 2)),
(add (shl VGPR_32, 3))]>;
// VGPR 256-bit registers
-def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
[(add (trunc VGPR_32, 249)),
(add (shl VGPR_32, 1)),
(add (shl VGPR_32, 2)),
@@ -269,8 +366,7 @@ def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
(add (shl VGPR_32, 7))]>;
// VGPR 512-bit registers
-def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
- sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
[(add (trunc VGPR_32, 241)),
(add (shl VGPR_32, 1)),
(add (shl VGPR_32, 2)),
@@ -368,13 +464,31 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
} // End CopyCost = 2
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
+ let AllocationPriority = 11;
+}
+
+def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
+ let isAllocatable = 0;
+}
+
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
+ (add SGPR_256, TTMP_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
let AllocationPriority = 11;
}
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> {
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> {
+ let AllocationPriority = 12;
+}
+
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> {
+ let isAllocatable = 0;
+}
+
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+ (add SGPR_512, TTMP_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
let AllocationPriority = 12;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 819a7add0be4..125a3b22d0cf 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -667,6 +667,10 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
+ CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
+ CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+ CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
}
#define CASE_CI_VI(node) \
diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp
index d2512c281a61..1acae3a88870 100644
--- a/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/lib/Target/ARC/ARCTargetMachine.cpp
@@ -88,8 +88,7 @@ extern "C" void LLVMInitializeARCTarget() {
RegisterTargetMachine<ARCTargetMachine> X(getTheARCTarget());
}
-TargetIRAnalysis ARCTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(ARCTTIImpl(this, F));
- });
+TargetTransformInfo
+ARCTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(ARCTTIImpl(this, F));
}
diff --git a/lib/Target/ARC/ARCTargetMachine.h b/lib/Target/ARC/ARCTargetMachine.h
index 98021b3dc1d5..18117e3409af 100644
--- a/lib/Target/ARC/ARCTargetMachine.h
+++ b/lib/Target/ARC/ARCTargetMachine.h
@@ -40,7 +40,7 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 3aac689c6310..9ffb4c2055f9 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -61,6 +61,7 @@ void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
void initializeARMConstantIslandsPass(PassRegistry &);
void initializeARMExpandPseudoPass(PassRegistry &);
+void initializeThumb2SizeReducePass(PassRegistry &);
} // end namespace llvm
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index c1a3f639461d..c9766aa2161a 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -83,6 +83,9 @@ def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
"Has v7 clrex instruction">;
+def FeatureDFB : SubtargetFeature<"dfb", "HasFullDataBarrier", "true",
+ "Has full data barrier (dfb) instruction">;
+
def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
"HasAcquireRelease", "true",
"Has v8 acquire/release (lda/ldaex "
@@ -617,6 +620,7 @@ def ARMv83a : Architecture<"armv8.3-a", "ARMv83a", [HasV8_3aOps,
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
+ FeatureDFB,
FeatureDSP,
FeatureCRC,
FeatureMP,
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 0ea435062ec0..60048d4453d8 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1416,7 +1416,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
case MVT::i8:
case MVT::i16:
needsExt = true;
- // Intentional fall-through.
+ LLVM_FALLTHROUGH;
case MVT::i32:
if (isThumb2) {
if (!UseImm)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 1b4d7ff50848..aeda7c06a27a 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1041,7 +1041,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!Subtarget->isThumb1Only())
setOperationAction(ISD::SETCCE, MVT::i32, Custom);
- setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
@@ -1084,20 +1084,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
- // Combine sin / cos into one node or libcall if possible.
- if (Subtarget->hasSinCos()) {
- setLibcallName(RTLIB::SINCOS_F32, "sincosf");
- setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget->isTargetWatchABI()) {
- setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
- setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
- }
- if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
- // For iOS, we don't want to the normal expansion of a libcall to
- // sincos. We want to issue a libcall to __sincos_stret.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ // Use __sincos_stret if available.
+ if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+ getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
// FP-ARMv8 implements a lot of rounding-like FP operations.
@@ -1255,6 +1246,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CMOV: return "ARMISD::CMOV";
case ARMISD::SSAT: return "ARMISD::SSAT";
+ case ARMISD::USAT: return "ARMISD::USAT";
case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
@@ -3902,6 +3894,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
+// This function returns three things: the arithmetic computation itself
+// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
+// comparison and the condition code define the case in which the arithmetic
+// computation *does not* overflow.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
SDValue &ARMcc) const {
@@ -3927,7 +3923,11 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
break;
case ISD::UADDO:
ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
- Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+ // We use ADDC here to correspond to its use in LowerUnsignedALUO.
+ // We do not use it in the USUBO case as Value may not be used.
+ Value = DAG.getNode(ARMISD::ADDC, dl,
+ DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
+ .getValue(0);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
break;
case ISD::SSUBO:
@@ -4205,7 +4205,7 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
}
-// Check if two chained conditionals could be converted into SSAT.
+// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
@@ -4216,10 +4216,14 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
// x < k ? (x < -k ? -k : x) : k
// etc.
//
+// USAT works similarly to SSAT, but bounds the value to the interval [0, k],
+// where k + 1 is a power of 2.
+//
// It returns true if the conversion can be done, false otherwise.
-// Additionally, the variable is returned in parameter V and the constant in K.
+// Additionally, the variable is returned in parameter V, the constant in K, and
+// usat is set to true if the conditional represents an unsigned saturation.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
- uint64_t &K) {
+ uint64_t &K, bool &usat) {
SDValue LHS1 = Op.getOperand(0);
SDValue RHS1 = Op.getOperand(1);
SDValue TrueVal1 = Op.getOperand(2);
@@ -4286,13 +4290,23 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
int64_t PosVal = std::max(Val1, Val2);
+ int64_t NegVal = std::min(Val1, Val2);
if (((Val1 > Val2 && UpperCheckOp == &Op) ||
(Val1 < Val2 && UpperCheckOp == &Op2)) &&
- Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
+ isPowerOf2_64(PosVal + 1)) {
+
+ // Handle the difference between USAT (unsigned) and SSAT (signed) saturation.
+ if (Val1 == ~Val2)
+ usat = false;
+ else if (NegVal == 0)
+ usat = true;
+ else
+ return false;
V = V2;
K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+
return true;
}
@@ -4306,10 +4320,16 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// Try to convert two saturating conditional selects into a single SSAT
SDValue SatValue;
uint64_t SatConstant;
+ bool SatUSat;
if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
- isSaturatingConditional(Op, SatValue, SatConstant))
- return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
- DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+ isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
+ if (SatUSat)
+ return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
+ DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+ else
+ return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
+ DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+ }
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
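// Illustrative sketch only (hypothetical helpers, not from the LLVM tree): the
// source-level clamping shapes the matcher above looks for. The first bounds a
// value to [~k, k] and is an SSAT candidate; the second bounds to [0, k] and is
// a candidate for the new USAT path (k + 1 is a power of 2 in both cases).
static int clampSigned(int x) {
  return x > 127 ? 127 : (x < -128 ? -128 : x);   // [-128, 127] == [~127, 127]
}
static int clampUnsigned(int x) {
  return x > 255 ? 255 : (x < 0 ? 0 : x);         // [0, 255]
}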
@@ -4506,6 +4526,39 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ SDLoc dl(Op);
+
+ // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ unsigned Opc = Cond.getOpcode();
+ if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO ||
+ Opc == ISD::SSUBO || Opc == ISD::USUBO)) {
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+ return SDValue();
+
+ // The actual operation with overflow check.
+ SDValue Value, OverflowCmp;
+ SDValue ARMcc;
+ std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
+
+ // Reverse the condition code.
+ ARMCC::CondCodes CondCode =
+ (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
+ CondCode = ARMCC::getOppositeCondition(CondCode);
+ ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+
+ return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
+ OverflowCmp);
+ }
+
+ return SDValue();
+}
+
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -4526,6 +4579,33 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ unsigned Opc = LHS.getOpcode();
+ if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+ return SDValue();
+
+ // The actual operation with overflow check.
+ SDValue Value, OverflowCmp;
+ SDValue ARMcc;
+ std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
+
+ if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
+ // Reverse the condition code.
+ ARMCC::CondCodes CondCode =
+ (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
+ CondCode = ARMCC::getOppositeCondition(CondCode);
+ ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
+ }
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+
+ return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
+ OverflowCmp);
+ }
+
if (LHS.getValueType() == MVT::i32) {
SDValue ARMcc;
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
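// Hedged sketch (assumed example, not from the LLVM tree) of the source shape
// the two blocks above are written for: an {s|u}{add|sub}.with.overflow result
// feeding a branch, which can now be lowered to a flag-setting ALU op plus a
// conditional branch on CPSR instead of materializing the i1 overflow bit.
bool addOrBail(unsigned A, unsigned B, unsigned *Out) {
  // Clang typically lowers this builtin to llvm.uadd.with.overflow.i32
  // followed by a br on the overflow bit.
  if (__builtin_uadd_overflow(A, B, Out))
    return false;   // overflow path
  return true;
}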
@@ -7523,10 +7603,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
Entry.IsZExt = false;
Args.push_back(Entry);
- const char *LibcallName =
- (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
RTLIB::Libcall LC =
- (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
+ (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+ const char *LibcallName = getLibcallName(LC);
CallingConv::ID CC = getLibcallCallingConv(LC);
SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
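// Illustrative source pattern (hypothetical function, not from the LLVM tree):
// FSINCOS exists to merge a sin and a cos of the same argument into one
// combined libcall. With the change above, the callee name comes from the
// RTLIB::SINCOS_STRET_* table instead of a hard-coded string.
#include <cmath>
void polarToCartesian(double R, double Theta, double *X, double *Y) {
  // cos(Theta) and sin(Theta) share an argument, so targets that provide a
  // combined sincos-style routine can compute both in a single call.
  *X = R * std::cos(Theta);
  *Y = R * std::sin(Theta);
}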
@@ -7782,6 +7861,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::BR_CC: return LowerBR_CC(Op, DAG);
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
@@ -13751,7 +13831,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
case AtomicOrdering::SequentiallyConsistent:
if (!Inst->hasAtomicStore())
return nullptr; // Nothing to do
- /*FALLTHROUGH*/
+ LLVM_FALLTHROUGH;
case AtomicOrdering::Release:
case AtomicOrdering::AcquireRelease:
if (Subtarget->preferISHSTBarriers())
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 0a1af8d89f9b..bf63dfae4407 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -87,6 +87,7 @@ class VectorType;
CMOV, // ARM conditional move instructions.
SSAT, // Signed saturation
+ USAT, // Unsigned saturation
BCC_i64,
@@ -643,6 +644,7 @@ class VectorType;
SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 4e13af596300..eb8526bfeadf 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -139,6 +139,8 @@ def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
+def ARMusatnoshift : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>;
+
def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
@@ -278,6 +280,9 @@ def HasDSP : Predicate<"Subtarget->hasDSP()">,
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
AssemblerPredicate<"FeatureDB",
"data-barriers">;
+def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">,
+ AssemblerPredicate<"FeatureDFB",
+ "full-data-barrier">;
def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
AssemblerPredicate<"FeatureV7Clrex",
"v7 clrex">;
@@ -3832,6 +3837,8 @@ def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
(USAT imm0_31:$pos, GPRnopc:$a, 0)>;
def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
(SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+def : ARMPat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos),
(SSAT16 imm1_16:$pos, GPRnopc:$a)>;
def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos),
@@ -5846,6 +5853,8 @@ include "ARMInstrNEON.td"
def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
+// Armv8-R 'Data Full Barrier'
+def : InstAlias<"dfb", (DSB 0xc), 1>, Requires<[IsARM, HasDFB]>;
// System instructions
def : MnemonicAlias<"swi", "svc">;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 670ed127da7e..4592249f5795 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -2336,6 +2336,8 @@ def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
(t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+def : T2Pat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm),
+ (t2USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos),
(t2SSAT imm1_32:$pos, GPR:$a, 0)>;
def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos),
@@ -4506,6 +4508,8 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+// Armv8-R 'Data Full Barrier'
+def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
// width specifier.
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 6bbeae2e1151..b0fd0b476920 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -669,13 +669,22 @@ bool ARMInstructionSelector::select(MachineInstr &I,
return true;
}
+ using namespace TargetOpcode;
+ if (I.getOpcode() == G_CONSTANT) {
+ // Pointer constants should be treated the same as 32-bit integer constants.
+ // Change the type and let TableGen handle it.
+ unsigned ResultReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(ResultReg);
+ if (Ty.isPointer())
+ MRI.setType(ResultReg, LLT::scalar(32));
+ }
+
if (selectImpl(I, CoverageInfo))
return true;
MachineInstrBuilder MIB{MF, I};
bool isSExt = false;
- using namespace TargetOpcode;
switch (I.getOpcode()) {
case G_SEXT:
isSExt = true;
@@ -741,6 +750,31 @@ bool ARMInstructionSelector::select(MachineInstr &I,
const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ if (SrcRegBank.getID() == ARM::FPRRegBankID) {
+ // This should only happen in the obscure case where we have put a 64-bit
+ // integer into a D register. Get it out of there and keep only the
+ // interesting part.
+ assert(I.getOpcode() == G_TRUNC && "Unsupported operand for G_ANYEXT");
+ assert(DstRegBank.getID() == ARM::GPRRegBankID &&
+ "Unsupported combination of register banks");
+ assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size");
+ assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size");
+
+ unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ auto InsertBefore = std::next(I.getIterator());
+ auto MovI =
+ BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD))
+ .addDef(DstReg)
+ .addDef(IgnoredBits)
+ .addUse(SrcReg)
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI))
+ return false;
+
+ MIB->eraseFromParent();
+ return true;
+ }
+
if (SrcRegBank.getID() != DstRegBank.getID()) {
DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
return false;
@@ -754,6 +788,28 @@ bool ARMInstructionSelector::select(MachineInstr &I,
I.setDesc(TII.get(COPY));
return selectCopy(I, TII, MRI, TRI, RBI);
}
+ case G_INTTOPTR:
+ case G_PTRTOINT: {
+ auto SrcReg = I.getOperand(1).getReg();
+ auto DstReg = I.getOperand(0).getReg();
+
+ const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ if (SrcRegBank.getID() != DstRegBank.getID()) {
+ DEBUG(dbgs()
+ << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n");
+ return false;
+ }
+
+ if (SrcRegBank.getID() != ARM::GPRRegBankID) {
+ DEBUG(dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n");
+ return false;
+ }
+
+ I.setDesc(TII.get(COPY));
+ return selectCopy(I, TII, MRI, TRI, RBI);
+ }
case G_SELECT:
return selectSelect(MIB, MRI);
case G_ICMP: {
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 2dd1dff64e87..8cff1f0869d0 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -126,6 +126,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, s32}, Legal);
}
+ setAction({G_INTTOPTR, p0}, Legal);
+ setAction({G_INTTOPTR, 1, s32}, Legal);
+
+ setAction({G_PTRTOINT, s32}, Legal);
+ setAction({G_PTRTOINT, 1, p0}, Legal);
+
for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})
setAction({Op, s32}, Legal);
@@ -139,6 +145,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({G_BRCOND, s1}, Legal);
setAction({G_CONSTANT, s32}, Legal);
+ setAction({G_CONSTANT, p0}, Legal);
setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
setAction({G_ICMP, s1}, Legal);
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index b32bfd449544..fad0e98285e6 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -226,12 +226,30 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_SEXT:
case G_ZEXT:
case G_ANYEXT:
- case G_TRUNC:
case G_GEP:
+ case G_INTTOPTR:
+ case G_PTRTOINT:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
// the real world we would use different mappings.
OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
break;
+ case G_TRUNC: {
+ // In some cases we may end up with a G_TRUNC from a 64-bit value to a
+ // 32-bit value. This isn't a real floating point trunc (that would be a
+ // G_FPTRUNC). Instead it is an integer trunc in disguise, which can appear
+ // because the legalizer doesn't distinguish between integer and floating
+ // point values so it may leave some 64-bit integers un-narrowed. Until we
+ // have a more principled solution that doesn't let such things sneak all
+ // the way to this point, just map the source to a DPR and the destination
+ // to a GPR.
+ LLT LargeTy = MRI.getType(MI.getOperand(1).getReg());
+ OperandsMapping =
+ LargeTy.getSizeInBits() <= 32
+ ? &ARM::ValueMappings[ARM::GPR3OpsIdx]
+ : getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]});
+ break;
+ }
case G_LOAD:
case G_STORE: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 4d4a88126ce6..23027e92481f 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -348,11 +348,6 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
return SchedModel.MispredictPenalty;
}
-bool ARMSubtarget::hasSinCos() const {
- return isTargetWatchOS() ||
- (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
-}
-
bool ARMSubtarget::enableMachineScheduler() const {
// Enable the MachineScheduler before register allocation for subtargets
// with the use-misched feature.
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 9301197e1387..eedb675a3304 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -236,6 +236,10 @@ protected:
/// instructions.
bool HasDataBarrier = false;
+ /// HasFullDataBarrier - True if the subtarget supports the DFB data barrier
+ /// instruction.
+ bool HasFullDataBarrier = false;
+
/// HasV7Clrex - True if the subtarget supports CLREX instructions
bool HasV7Clrex = false;
@@ -544,6 +548,7 @@ public:
bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; }
bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
bool hasDataBarrier() const { return HasDataBarrier; }
+ bool hasFullDataBarrier() const { return HasFullDataBarrier; }
bool hasV7Clrex() const { return HasV7Clrex; }
bool hasAcquireRelease() const { return HasAcquireRelease; }
@@ -712,10 +717,6 @@ public:
unsigned getMispredictionPenalty() const;
- /// This function returns true if the target has sincos() routine in its
- /// compiler runtime or math libraries.
- bool hasSinCos() const;
-
/// Returns true if machine scheduler should be enabled.
bool enableMachineScheduler() const override;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 51982b2dab14..0f6d1eddc985 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -92,6 +92,7 @@ extern "C" void LLVMInitializeARMTarget() {
initializeARMConstantIslandsPass(Registry);
initializeARMExecutionDepsFixPass(Registry);
initializeARMExpandPseudoPass(Registry);
+ initializeThumb2SizeReducePass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -282,10 +283,9 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
-TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(ARMTTIImpl(this, F));
- });
+TargetTransformInfo
+ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(ARMTTIImpl(this, F));
}
ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 655ec3202bfb..2072bb731f0a 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -53,8 +53,7 @@ public:
const ARMSubtarget *getSubtargetImpl() const = delete;
bool isLittleEndian() const { return isLittle; }
- /// \brief Get the TargetIRAnalysis for this target.
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cae01e415eff..43d7888075b5 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -394,25 +394,6 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return 1;
}
-int ARMTTIImpl::getFPOpCost(Type *Ty) {
- // Use similar logic that's in ARMISelLowering:
- // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
- // to VFP.
-
- if (ST->hasVFP2() && !ST->isThumb1Only()) {
- if (Ty->isFloatTy()) {
- return TargetTransformInfo::TCC_Basic;
- }
-
- if (Ty->isDoubleTy()) {
- return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
- TargetTransformInfo::TCC_Basic;
- }
- }
-
- return TargetTransformInfo::TCC_Expensive;
-}
-
int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
// We only handle costs of reverse and alternate shuffles for now.
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 99353a3219a0..cd9fa0709020 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -156,8 +156,6 @@ public:
int getAddressComputationCost(Type *Val, ScalarEvolution *SE,
const SCEV *Ptr);
- int getFPOpCost(Type *Ty);
-
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 26fda5f22b4f..97b642c99f80 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -5581,11 +5581,11 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
CanAcceptPredicationCode =
Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
- Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
- Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
- Mnemonic != "ldc2" && Mnemonic != "ldc2l" && Mnemonic != "stc2" &&
- Mnemonic != "stc2l" && !Mnemonic.startswith("rfe") &&
- !Mnemonic.startswith("srs");
+ Mnemonic != "dmb" && Mnemonic != "dfb" && Mnemonic != "dsb" &&
+ Mnemonic != "isb" && Mnemonic != "pld" && Mnemonic != "pli" &&
+ Mnemonic != "pldw" && Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
+ Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+ !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
} else if (isThumbOne()) {
if (hasV6MOps())
CanAcceptPredicationCode = Mnemonic != "movs";
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index a29a2eeccfe8..53c635877675 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -2386,6 +2386,7 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
case ARM::VLD4q32_UPD:
if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
return MCDisassembler::Fail;
+ break;
default:
break;
}
@@ -3326,6 +3327,7 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
case ARM::t2STRs:
if (Rn == 15)
return MCDisassembler::Fail;
+ break;
default:
break;
}
@@ -3391,6 +3393,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
break;
case ARM::t2LDRSBs:
Inst.setOpcode(ARM::t2PLIs);
+ break;
default:
break;
}
@@ -3854,6 +3857,7 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
case ARM::t2STRHi12:
if (Rn == 15)
return MCDisassembler::Fail;
+ break;
default:
break;
}
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 3920c73fba6a..5357e26856ea 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -45,6 +45,7 @@
using namespace llvm;
#define DEBUG_TYPE "t2-reduce-size"
+#define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass"
STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones");
STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones");
@@ -162,7 +163,7 @@ namespace {
const Thumb2InstrInfo *TII;
const ARMSubtarget *STI;
- Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
+ Thumb2SizeReduce(std::function<bool(const Function &)> Ftor = nullptr);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -172,7 +173,7 @@ namespace {
}
StringRef getPassName() const override {
- return "Thumb2 instruction size reduction pass";
+ return THUMB2_SIZE_REDUCE_NAME;
}
private:
@@ -237,6 +238,9 @@ namespace {
} // end anonymous namespace
+INITIALIZE_PASS(Thumb2SizeReduce, DEBUG_TYPE, THUMB2_SIZE_REDUCE_NAME, false,
+ false)
+
Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
: MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
OptimizeSize = MinimizeSize = false;
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
index 6f81e020b996..1f4ef098403d 100644
--- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -56,7 +56,7 @@ void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
O << getRegisterName(Op.getReg());
} else if (Op.isImm()) {
- O << (int32_t)Op.getImm();
+ O << formatImm((int32_t)Op.getImm());
} else {
assert(Op.isExpr() && "Expected an expression");
printExpr(Op.getExpr(), O);
@@ -76,9 +76,9 @@ void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
if (OffsetOp.isImm()) {
auto Imm = OffsetOp.getImm();
if (Imm >= 0)
- O << " + " << formatDec(Imm);
+ O << " + " << formatImm(Imm);
else
- O << " - " << formatDec(-Imm);
+ O << " - " << formatImm(-Imm);
} else {
assert(0 && "Expected an immediate");
}
@@ -88,7 +88,7 @@ void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm())
- O << (uint64_t)Op.getImm();
+ O << formatImm(Op.getImm());
else if (Op.isExpr())
printExpr(Op.getExpr(), O);
else
@@ -100,7 +100,7 @@ void BPFInstPrinter::printBrTargetOperand(const MCInst *MI, unsigned OpNo,
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm()) {
int16_t Imm = Op.getImm();
- O << ((Imm >= 0) ? "+" : "") << Imm;
+ O << ((Imm >= 0) ? "+" : "") << formatImm(Imm);
} else if (Op.isExpr()) {
printExpr(Op.getExpr(), O);
} else {
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 537f97c9a987..8b6c571dee02 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -756,11 +756,11 @@ struct ShuffleMask {
ShuffleMask lo() const {
size_t H = Mask.size()/2;
- return ShuffleMask({Mask.data(), H});
+ return ShuffleMask(Mask.take_front(H));
}
ShuffleMask hi() const {
size_t H = Mask.size()/2;
- return ShuffleMask({Mask.data()+H, H});
+ return ShuffleMask(Mask.take_back(H));
}
};
@@ -836,15 +836,6 @@ namespace llvm {
};
}
-// Return a submask of A that is shorter than A by |C| elements:
-// - if C > 0, return a submask of A that starts at position C,
-// - if C <= 0, return a submask of A that starts at 0 (reduce A by |C|).
-static ArrayRef<int> subm(ArrayRef<int> A, int C) {
- if (C > 0)
- return { A.data()+C, A.size()-C };
- return { A.data(), A.size()+C };
-}
-
static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
MutableArrayRef<int> MaskR) {
unsigned VecLen = Mask.size();
@@ -910,21 +901,38 @@ bool HvxSelector::selectVectorConstants(SDNode *N) {
// Since they are generated during the selection process, the main
// selection algorithm is not aware of them. Select them directly
// here.
- if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
- SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
- unsigned AddrOpc = Addr.getOpcode();
- if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) {
- if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) {
- ISel.Select(N);
- return true;
- }
+ SmallVector<SDNode*,4> Loads;
+ SmallVector<SDNode*,16> WorkQ;
+
+ // The DAG can change (due to CSE) during selection, so cache all the
+ // unselected nodes first to avoid traversing a mutating DAG.
+
+ auto IsLoadToSelect = [] (SDNode *N) {
+ if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
+ SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
+ unsigned AddrOpc = Addr.getOpcode();
+ if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
+ if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+ return true;
}
+ return false;
+ };
+
+ WorkQ.push_back(N);
+ for (unsigned i = 0; i != WorkQ.size(); ++i) {
+ SDNode *W = WorkQ[i];
+ if (IsLoadToSelect(W)) {
+ Loads.push_back(W);
+ continue;
+ }
+ for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
+ WorkQ.push_back(W->getOperand(j).getNode());
}
- bool Selected = false;
- for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
- Selected = selectVectorConstants(N->getOperand(I).getNode()) || Selected;
- return Selected;
+ for (SDNode *L : Loads)
+ ISel.Select(L);
+
+ return !Loads.empty();
}
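// Standalone sketch with hypothetical types (not from the LLVM tree) of the
// traversal pattern used above: indexing with I != WorkQ.size() means nodes
// pushed during the walk are still visited, and all matches are collected
// before any is selected, so selection cannot mutate the DAG mid-traversal.
#include <vector>
struct Node { bool Wanted; std::vector<Node *> Operands; };
static std::vector<Node *> collectWanted(Node *Root) {
  std::vector<Node *> Matches;
  std::vector<Node *> WorkQ{Root};
  for (unsigned I = 0; I != WorkQ.size(); ++I) {   // WorkQ grows as we iterate
    Node *W = WorkQ[I];
    if (W->Wanted) {
      Matches.push_back(W);
      continue;
    }
    for (Node *Op : W->Operands)
      WorkQ.push_back(Op);
  }
  return Matches;
}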
void HvxSelector::materialize(const ResultStack &Results) {
@@ -1159,8 +1167,8 @@ OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
ResultStack &Results) {
DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
size_t S = Bytes.size() / 2;
- OpRef L = vmuxs({Bytes.data(), S}, OpRef::lo(Va), OpRef::lo(Vb), Results);
- OpRef H = vmuxs({Bytes.data()+S, S}, OpRef::hi(Va), OpRef::hi(Vb), Results);
+ OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results);
+ OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results);
return concat(L, H, Results);
}
@@ -1435,7 +1443,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
return OpRef::fail();
// Examine the rest of the mask.
for (int I = L; I < N; I += L) {
- auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+ auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
// Check whether the mask element at the beginning of each strip
// increases by 2L each time.
if (S.first - Strip.first != 2*I)
@@ -1465,7 +1473,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
std::pair<int,unsigned> PrevS = Strip;
bool Flip = false;
for (int I = L; I < N; I += L) {
- auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+ auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
if (S.second != PrevS.second)
return OpRef::fail();
int Diff = Flip ? PrevS.first - S.first + 2*L
@@ -1524,7 +1532,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
// First, check the non-ignored strips.
for (int I = 2*L; I < 2*N; I += 2*L) {
- auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+ auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
if (S.second != unsigned(L))
return OpRef::fail();
if (2*S.first != I)
@@ -1532,7 +1540,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
}
// Check the -1s.
for (int I = L; I < 2*N; I += 2*L) {
- auto S = findStrip(subm(SM.Mask,I), 0, N-I);
+ auto S = findStrip(SM.Mask.drop_front(I), 0, N-I);
if (S.first != -1 || S.second != unsigned(L))
return OpRef::fail();
}
@@ -1666,8 +1674,8 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
if (!isPowerOf2_32(X))
return OpRef::fail();
// Check the other segments of Mask.
- for (int J = 0; J < VecLen; J += I) {
- if (XorPow2(subm(SM.Mask, -J), I) != X)
+ for (int J = I; J < VecLen; J += I) {
+ if (XorPow2(SM.Mask.slice(J, I), I) != X)
return OpRef::fail();
}
Perm[Log2_32(X)] = Log2_32(I)-1;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 586363335df1..0e0da2ddc400 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -761,11 +761,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Promote the value if needed.
switch (VA.getLocInfo()) {
default:
- // Loc info must be one of Full, SExt, ZExt, or AExt.
+ // Loc info must be one of Full, BCvt, SExt, ZExt, or AExt.
llvm_unreachable("Unknown loc info!");
- case CCValAssign::BCvt:
case CCValAssign::Full:
break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getBitcast(VA.getLocVT(), Arg);
+ break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
@@ -1135,6 +1137,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
unsigned VReg =
RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ RegVT = VA.getValVT();
SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
// Treat values of type MVT::i1 specially: they are passed in
// registers of type i32, but they need to remain as values of
@@ -1155,6 +1159,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
unsigned VReg =
RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ RegVT = VA.getValVT();
InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
// Single Vector
@@ -1715,8 +1721,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4i1, &Hexagon::PredRegsRegClass); // ddccbbaa
addRegisterClass(MVT::v8i1, &Hexagon::PredRegsRegClass); // hgfedcba
addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
- addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
addRegisterClass(MVT::v8i8, &Hexagon::DoubleRegsRegClass);
addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
@@ -1735,6 +1741,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
+ // These "short" boolean vector types should be legal because
+ // they will appear as results of vector compares. If they were
+ // not legal, type legalization would try to make them legal
+ // and that would require using operations that do not use or
+ // produce such types. That, in turn, would imply using custom
+ // nodes, which would be unoptimizable by the DAG combiner.
+ // The idea is to rely on target-independent operations as much
+ // as possible.
addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
@@ -1964,9 +1978,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
// Types natively supported:
- for (MVT NativeVT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v32i1, MVT::v64i1,
- MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v1i32,
- MVT::v2i32, MVT::v1i64}) {
+ for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16,
+ MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) {
setOperationAction(ISD::BUILD_VECTOR, NativeVT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, NativeVT, Custom);
@@ -1992,63 +2005,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
AddPromotedToType(Opc, FromTy, ToTy);
};
- if (Subtarget.useHVXOps()) {
- bool Use64b = Subtarget.useHVX64BOps();
- ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
- ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
- MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
- MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
-
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
- setOperationAction(ISD::AND, ByteV, Legal);
- setOperationAction(ISD::OR, ByteV, Legal);
- setOperationAction(ISD::XOR, ByteV, Legal);
-
- for (MVT T : LegalV) {
- setIndexedLoadAction(ISD::POST_INC, T, Legal);
- setIndexedStoreAction(ISD::POST_INC, T, Legal);
-
- setOperationAction(ISD::ADD, T, Legal);
- setOperationAction(ISD::SUB, T, Legal);
- setOperationAction(ISD::VSELECT, T, Legal);
- if (T != ByteV) {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
- }
-
- setOperationAction(ISD::MUL, T, Custom);
- setOperationAction(ISD::SETCC, T, Custom);
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
- if (T != ByteV)
- setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
- }
-
- for (MVT T : LegalV) {
- if (T == ByteV)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
- setPromoteTo(ISD::AND, T, ByteV);
- setPromoteTo(ISD::OR, T, ByteV);
- setPromoteTo(ISD::XOR, T, ByteV);
- }
-
- for (MVT T : LegalW) {
- if (T == ByteW)
- continue;
- // Promote all shuffles and concats to operate on vectors of bytes.
- setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
- setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
- }
- }
-
// Subtarget-specific operation actions.
//
if (Subtarget.hasV5TOps()) {
@@ -2110,6 +2066,67 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setIndexedStoreAction(ISD::POST_INC, VT, Legal);
}
+ if (Subtarget.useHVXOps()) {
+ bool Use64b = Subtarget.useHVX64BOps();
+ ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
+ ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
+ MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8;
+ MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
+ setOperationAction(ISD::AND, ByteV, Legal);
+ setOperationAction(ISD::OR, ByteV, Legal);
+ setOperationAction(ISD::XOR, ByteV, Legal);
+
+ for (MVT T : LegalV) {
+ setIndexedLoadAction(ISD::POST_INC, T, Legal);
+ setIndexedStoreAction(ISD::POST_INC, T, Legal);
+
+ setOperationAction(ISD::ADD, T, Legal);
+ setOperationAction(ISD::SUB, T, Legal);
+ if (T != ByteV) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+ }
+
+ setOperationAction(ISD::MUL, T, Custom);
+ setOperationAction(ISD::SETCC, T, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+ if (T != ByteV)
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
+ }
+
+ for (MVT T : LegalV) {
+ if (T == ByteV)
+ continue;
+ // Promote all shuffles and concats to operate on vectors of bytes.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
+ setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
+ setPromoteTo(ISD::AND, T, ByteV);
+ setPromoteTo(ISD::OR, T, ByteV);
+ setPromoteTo(ISD::XOR, T, ByteV);
+ }
+
+ for (MVT T : LegalW) {
+ // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
+ // independent) handling of it would convert it to a load, which is
+ // not always the optimal choice.
+ setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+
+ if (T == ByteW)
+ continue;
+ // Promote all shuffles and concats to operate on vectors of bytes.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
+ setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
+ }
+ }
+
computeRegisterProperties(&HRI);
//
@@ -2256,6 +2273,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0";
case HexagonISD::VROR: return "HexagonISD::VROR";
case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
+ case HexagonISD::VZERO: return "HexagonISD::VZERO";
case HexagonISD::OP_END: break;
}
return nullptr;
@@ -2331,14 +2349,27 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,
TargetLoweringBase::LegalizeTypeAction
HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT.getVectorNumElements() == 1)
+ return TargetLoweringBase::TypeScalarizeVector;
+
+ // Always widen vectors of i1.
+ MVT ElemTy = VT.getSimpleVT().getVectorElementType();
+ if (ElemTy == MVT::i1)
+ return TargetLoweringBase::TypeWidenVector;
+
if (Subtarget.useHVXOps()) {
// If the size of VT is at least half of the vector length,
// widen the vector. Note: the threshold was not selected in
// any scientific way.
- if (VT.getSizeInBits() >= Subtarget.getVectorLength()*8/2)
- return TargetLoweringBase::TypeWidenVector;
+ ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes();
+ if (llvm::find(Tys, ElemTy) != Tys.end()) {
+ unsigned HwWidth = 8*Subtarget.getVectorLength();
+ unsigned VecWidth = VT.getSizeInBits();
+ if (VecWidth >= HwWidth/2 && VecWidth < HwWidth)
+ return TargetLoweringBase::TypeWidenVector;
+ }
}
- return TargetLowering::getPreferredVectorAction(VT);
+ return TargetLoweringBase::TypeSplitVector;
}
// Lower a vector shuffle (V1, V2, V3). V1 and V2 are the two vectors
@@ -2463,21 +2494,43 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, dl, VT, Result);
}
+bool
+HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
+ MVT VecTy, SelectionDAG &DAG,
+ MutableArrayRef<ConstantInt*> Consts) const {
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+ IntegerType *IntTy = IntegerType::get(*DAG.getContext(), ElemWidth);
+ bool AllConst = true;
+
+ for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+ SDValue V = Values[i];
+ if (V.isUndef()) {
+ Consts[i] = ConstantInt::get(IntTy, 0);
+ continue;
+ }
+ if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
+ const ConstantInt *CI = CN->getConstantIntValue();
+ Consts[i] = const_cast<ConstantInt*>(CI);
+ } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
+ const ConstantFP *CF = CN->getConstantFPValue();
+ APInt A = CF->getValueAPF().bitcastToAPInt();
+ Consts[i] = ConstantInt::get(IntTy, A.getZExtValue());
+ } else {
+ AllConst = false;
+ }
+ }
+ return AllConst;
+}
+
SDValue
HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
MVT VecTy, SelectionDAG &DAG) const {
MVT ElemTy = VecTy.getVectorElementType();
assert(VecTy.getVectorNumElements() == Elem.size());
- SmallVector<ConstantSDNode*,4> Consts;
- bool AllConst = true;
- for (SDValue V : Elem) {
- if (isUndef(V))
- V = DAG.getConstant(0, dl, ElemTy);
- auto *C = dyn_cast<ConstantSDNode>(V.getNode());
- Consts.push_back(C);
- AllConst = AllConst && C != nullptr;
- }
+ SmallVector<ConstantInt*,4> Consts(Elem.size());
+ bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);
unsigned First, Num = Elem.size();
for (First = 0; First != Num; ++First)
@@ -2486,6 +2539,10 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
if (First == Num)
return DAG.getUNDEF(VecTy);
+ if (AllConst &&
+ llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
+ return getZero(dl, VecTy, DAG);
+
if (ElemTy == MVT::i16) {
assert(Elem.size() == 2);
if (AllConst) {
@@ -2498,45 +2555,55 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
return DAG.getBitcast(MVT::v2i16, N);
}
- // First try generating a constant.
- assert(ElemTy == MVT::i8 && Num == 4);
- if (AllConst) {
- int32_t V = (Consts[0]->getZExtValue() & 0xFF) |
- (Consts[1]->getZExtValue() & 0xFF) << 8 |
- (Consts[1]->getZExtValue() & 0xFF) << 16 |
- Consts[2]->getZExtValue() << 24;
- return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32));
- }
+ if (ElemTy == MVT::i8) {
+ // First try generating a constant.
+ if (AllConst) {
+ int32_t V = (Consts[0]->getZExtValue() & 0xFF) |
+ (Consts[1]->getZExtValue() & 0xFF) << 8 |
+ (Consts[1]->getZExtValue() & 0xFF) << 16 |
+ Consts[2]->getZExtValue() << 24;
+ return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32));
+ }
- // Then try splat.
- bool IsSplat = true;
- for (unsigned i = 0; i != Num; ++i) {
- if (i == First)
- continue;
- if (Elem[i] == Elem[First] || isUndef(Elem[i]))
- continue;
- IsSplat = false;
- break;
- }
- if (IsSplat)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]);
+ // Then try splat.
+ bool IsSplat = true;
+ for (unsigned i = 0; i != Num; ++i) {
+ if (i == First)
+ continue;
+ if (Elem[i] == Elem[First] || isUndef(Elem[i]))
+ continue;
+ IsSplat = false;
+ break;
+ }
+ if (IsSplat) {
+ // Legalize the operand to VSPLAT.
+ SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32);
+ return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext);
+ }
- // Generate
- // (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) |
- // (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16
- SDValue S8 = DAG.getConstant(8, dl, MVT::i32);
- SDValue V0 = DAG.getZeroExtendInReg(Elem[0], dl, MVT::i8);
- SDValue V1 = DAG.getZeroExtendInReg(Elem[1], dl, MVT::i8);
- SDValue V2 = DAG.getZeroExtendInReg(Elem[2], dl, MVT::i8);
- SDValue V3 = DAG.getZeroExtendInReg(Elem[3], dl, MVT::i8);
+ // Generate
+ // (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) |
+ // (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16
+ assert(Elem.size() == 4);
+ SDValue Vs[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ Vs[i] = DAG.getZExtOrTrunc(Elem[i], dl, MVT::i32);
+ Vs[i] = DAG.getZeroExtendInReg(Vs[i], dl, MVT::i8);
+ }
+ SDValue S8 = DAG.getConstant(8, dl, MVT::i32);
+ SDValue T0 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[1], S8});
+ SDValue T1 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[3], S8});
+ SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0});
+ SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1});
- SDValue V4 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V1, S8});
- SDValue V5 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V3, S8});
- SDValue V6 = DAG.getNode(ISD::OR, dl, MVT::i32, {V0, V4});
- SDValue V7 = DAG.getNode(ISD::OR, dl, MVT::i32, {V2, V5});
+ SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
+ return DAG.getBitcast(MVT::v4i8, R);
+ }
- SDValue T0 = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {V7, V6}, DAG);
- return DAG.getBitcast(MVT::v4i8, T0);
+#ifndef NDEBUG
+ dbgs() << "VecTy: " << EVT(VecTy).getEVTString() << '\n';
+#endif
+ llvm_unreachable("Unexpected vector element type");
}
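// Scalar illustration with an assumed standalone helper (not from the LLVM
// tree) of the v4i8 packing emitted above, i.e.
// (zxtb(E0) | zxtb(E1) << 8) | (zxtb(E2) | zxtb(E3) << 8) << 16.
#include <cstdint>
static uint32_t packV4i8(uint8_t E0, uint8_t E1, uint8_t E2, uint8_t E3) {
  uint32_t B0 = uint32_t(E0) | (uint32_t(E1) << 8);   // low half  (Vs[0] | T0)
  uint32_t B1 = uint32_t(E2) | (uint32_t(E3) << 8);   // high half (Vs[2] | T1)
  return B0 | (B1 << 16);                             // A2_combine_ll of B1, B0
}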
SDValue
@@ -2545,15 +2612,8 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
MVT ElemTy = VecTy.getVectorElementType();
assert(VecTy.getVectorNumElements() == Elem.size());
- SmallVector<ConstantSDNode*,8> Consts;
- bool AllConst = true;
- for (SDValue V : Elem) {
- if (isUndef(V))
- V = DAG.getConstant(0, dl, ElemTy);
- auto *C = dyn_cast<ConstantSDNode>(V.getNode());
- Consts.push_back(C);
- AllConst = AllConst && C != nullptr;
- }
+ SmallVector<ConstantInt*,8> Consts(Elem.size());
+ bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);
unsigned First, Num = Elem.size();
for (First = 0; First != Num; ++First)
@@ -2562,6 +2622,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
if (First == Num)
return DAG.getUNDEF(VecTy);
+ if (AllConst &&
+ llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
+ return getZero(dl, VecTy, DAG);
+
// First try splat if possible.
if (ElemTy == MVT::i16) {
bool IsSplat = true;
@@ -2573,8 +2637,11 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
IsSplat = false;
break;
}
- if (IsSplat)
- return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]);
+ if (IsSplat) {
+ // Legalize the operand to VSPLAT.
+ SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32);
+ return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext);
+ }
}
// Then try constant.
@@ -2593,10 +2660,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
MVT HalfTy = MVT::getVectorVT(ElemTy, Num/2);
SDValue L = (ElemTy == MVT::i32)
? Elem[0]
- : buildVector32({Elem.data(), Num/2}, dl, HalfTy, DAG);
+ : buildVector32(Elem.take_front(Num/2), dl, HalfTy, DAG);
SDValue H = (ElemTy == MVT::i32)
? Elem[1]
- : buildVector32({Elem.data()+Num/2, Num/2}, dl, HalfTy, DAG);
+ : buildVector32(Elem.drop_front(Num/2), dl, HalfTy, DAG);
return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, {H, L});
}
@@ -2696,21 +2763,41 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
}
SDValue
+HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
+ const {
+ if (Ty.isVector()) {
+ assert(Ty.isInteger() && "Only integer vectors are supported here");
+ unsigned W = Ty.getSizeInBits();
+ if (W <= 64)
+ return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W)));
+ return DAG.getNode(HexagonISD::VZERO, dl, Ty);
+ }
+
+ if (Ty.isInteger())
+ return DAG.getConstant(0, dl, Ty);
+ if (Ty.isFloatingPoint())
+ return DAG.getConstantFP(0.0, dl, Ty);
+ llvm_unreachable("Invalid type for zero");
+}
+
+SDValue
HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT VecTy = ty(Op);
unsigned BW = VecTy.getSizeInBits();
+
+ if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true))
+ return LowerHvxBuildVector(Op, DAG);
+
if (BW == 32 || BW == 64) {
+ const SDLoc &dl(Op);
SmallVector<SDValue,8> Ops;
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
Ops.push_back(Op.getOperand(i));
if (BW == 32)
- return buildVector32(Ops, SDLoc(Op), VecTy, DAG);
- return buildVector64(Ops, SDLoc(Op), VecTy, DAG);
+ return buildVector32(Ops, dl, VecTy, DAG);
+ return buildVector64(Ops, dl, VecTy, DAG);
}
- if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
- return LowerHvxBuildVector(Op, DAG);
-
return SDValue();
}
@@ -2822,7 +2909,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
#ifndef NDEBUG
Op.getNode()->dumpr(&DAG);
if (Opc > HexagonISD::OP_BEGIN && Opc < HexagonISD::OP_END)
- errs() << "Check for a non-legal type in this operation\n";
+ errs() << "Error: check for a non-legal type in this operation\n";
#endif
llvm_unreachable("Should not custom lower this!");
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 0619e2e4e7f9..732834b464b4 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -70,6 +70,7 @@ namespace HexagonISD {
EH_RETURN,
DCFETCH,
READCYCLE,
+ VZERO,
OP_END
};
@@ -283,6 +284,9 @@ namespace HexagonISD {
}
private:
+ bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy,
+ SelectionDAG &DAG,
+ MutableArrayRef<ConstantInt*> Consts) const;
SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
SelectionDAG &DAG) const;
SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
@@ -301,6 +305,7 @@ namespace HexagonISD {
SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
return SDValue(N, 0);
}
+ SDValue getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const;
using VectorPair = std::pair<SDValue, SDValue>;
using TypePair = std::pair<MVT, MVT>;
@@ -344,6 +349,13 @@ namespace HexagonISD {
SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,
ArrayRef<int> Mask, SelectionDAG &DAG) const;
+ MVT getVecBoolVT() const;
+
+ SDValue buildHvxVectorSingle(ArrayRef<SDValue> Values, const SDLoc &dl,
+ MVT VecTy, SelectionDAG &DAG) const;
+ SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl,
+ MVT VecTy, SelectionDAG &DAG) const;
+
SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index c1d44cb0e7de..51480d09d734 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -141,49 +141,50 @@ HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0,
opCastElem(Op1, MVT::i8, DAG), ByteMask);
}
+MVT
+HexagonTargetLowering::getVecBoolVT() const {
+ return MVT::getVectorVT(MVT::i1, 8*Subtarget.getVectorLength());
+}
+
SDValue
-HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
- const {
- const SDLoc &dl(Op);
- BuildVectorSDNode *BN = cast<BuildVectorSDNode>(Op.getNode());
- bool IsConst = BN->isConstant();
+HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
+ const SDLoc &dl, MVT VecTy,
+ SelectionDAG &DAG) const {
+ unsigned VecLen = Values.size();
MachineFunction &MF = DAG.getMachineFunction();
- MVT VecTy = ty(Op);
+ MVT ElemTy = VecTy.getVectorElementType();
+ unsigned ElemWidth = ElemTy.getSizeInBits();
+ unsigned HwLen = Subtarget.getVectorLength();
- if (IsConst) {
- SmallVector<Constant*, 128> Elems;
- for (SDValue V : BN->op_values()) {
- if (auto *C = dyn_cast<ConstantSDNode>(V.getNode()))
- Elems.push_back(const_cast<ConstantInt*>(C->getConstantIntValue()));
- }
- Constant *CV = ConstantVector::get(Elems);
- unsigned Align = VecTy.getSizeInBits() / 8;
+ SmallVector<ConstantInt*, 128> Consts(VecLen);
+ bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts);
+ if (AllConst) {
+ if (llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
+ return getZero(dl, VecTy, DAG);
+
+ ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
+ (Constant**)Consts.end());
+ Constant *CV = ConstantVector::get(Tmp);
+ unsigned Align = HwLen;
SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(MF), Align);
}
- unsigned NumOps = Op.getNumOperands();
- unsigned HwLen = Subtarget.getVectorLength();
- unsigned ElemSize = VecTy.getVectorElementType().getSizeInBits() / 8;
- assert(ElemSize*NumOps == HwLen);
-
+ unsigned ElemSize = ElemWidth / 8;
+ assert(ElemSize*VecLen == HwLen);
SmallVector<SDValue,32> Words;
- SmallVector<SDValue,32> Ops;
- for (unsigned i = 0; i != NumOps; ++i)
- Ops.push_back(Op.getOperand(i));
if (VecTy.getVectorElementType() != MVT::i32) {
- assert(ElemSize < 4 && "vNi64 should have been promoted to vNi32");
assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size");
unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2;
MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord);
- for (unsigned i = 0; i != NumOps; i += OpsPerWord) {
- SDValue W = buildVector32({&Ops[i], OpsPerWord}, dl, PartVT, DAG);
+ for (unsigned i = 0; i != VecLen; i += OpsPerWord) {
+ SDValue W = buildVector32(Values.slice(i, OpsPerWord), dl, PartVT, DAG);
Words.push_back(DAG.getBitcast(MVT::i32, W));
}
} else {
- Words.assign(Ops.begin(), Ops.end());
+ Words.assign(Values.begin(), Values.end());
}
// Construct two halves in parallel, then or them together.
@@ -208,6 +209,83 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
}
SDValue
+HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
+ const SDLoc &dl, MVT VecTy,
+ SelectionDAG &DAG) const {
+ // Construct a vector V of bytes such that a comparison V >u 0 would
+ // produce the required vector predicate.
+ unsigned VecLen = Values.size();
+ unsigned HwLen = Subtarget.getVectorLength();
+ assert(VecLen <= HwLen || VecLen == 8*HwLen);
+ SmallVector<SDValue,128> Bytes;
+
+ if (VecLen <= HwLen) {
+ // In the hardware, each bit of a vector predicate corresponds to a byte
+ // of a vector register. Calculate how many bytes a bit of VecTy
+ // corresponds to.
+ assert(HwLen % VecLen == 0);
+ unsigned BitBytes = HwLen / VecLen;
+ for (SDValue V : Values) {
+ SDValue Ext = !V.isUndef() ? DAG.getZExtOrTrunc(V, dl, MVT::i8)
+ : DAG.getConstant(0, dl, MVT::i8);
+ for (unsigned B = 0; B != BitBytes; ++B)
+ Bytes.push_back(Ext);
+ }
+ } else {
+ // There are as many i1 values as there are bits in a vector register.
+ // Divide the values into groups of 8 and check that each group consists
+ // of the same value (ignoring undefs).
+ for (unsigned I = 0; I != VecLen; I += 8) {
+ unsigned B = 0;
+ // Find the first non-undef value in this group.
+ for (; B != 8; ++B) {
+ if (!Values[I+B].isUndef())
+ break;
+ }
+ SDValue F = Values[I+B];
+ SDValue Ext = (B < 8) ? DAG.getZExtOrTrunc(F, dl, MVT::i8)
+ : DAG.getConstant(0, dl, MVT::i8);
+ Bytes.push_back(Ext);
+ // Verify that the rest of the values in the group are the same as the
+ // first.
+ for (; B != 8; ++B)
+ assert(Values[I+B].isUndef() || Values[I+B] == F);
+ }
+ }
+
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ SDValue ByteVec = buildHvxVectorSingle(Bytes, dl, ByteTy, DAG);
+ SDValue Cmp = DAG.getSetCC(dl, VecTy, ByteVec, getZero(dl, ByteTy, DAG),
+ ISD::SETUGT);
+ return Cmp;
+}
+
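// Minimal sketch where plain containers stand in for SDValues and HVX
// registers (hypothetical helper, not from the LLVM tree): the first branch
// above replicates each boolean lane into HwLen/VecLen bytes so that a later
// unsigned "greater than zero" compare of the byte vector reproduces the
// original predicate.
#include <cstdint>
#include <vector>
static std::vector<uint8_t> expandPredBits(const std::vector<bool> &Values,
                                           unsigned HwLen) {
  unsigned BitBytes = HwLen / Values.size();          // bytes per predicate bit
  std::vector<uint8_t> Bytes;
  for (bool V : Values)
    Bytes.insert(Bytes.end(), BitBytes, V ? 1 : 0);   // one lane -> BitBytes bytes
  return Bytes;                                       // then compare Bytes >u 0
}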
+SDValue
+HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ MVT VecTy = ty(Op);
+
+ unsigned Size = Op.getNumOperands();
+ SmallVector<SDValue,128> Ops;
+ for (unsigned i = 0; i != Size; ++i)
+ Ops.push_back(Op.getOperand(i));
+
+ if (VecTy.getVectorElementType() == MVT::i1)
+ return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+
+ if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
+ ArrayRef<SDValue> A(Ops);
+ MVT SingleTy = typeSplit(VecTy).first;
+ SDValue V0 = buildHvxVectorSingle(A.take_front(Size/2), dl, SingleTy, DAG);
+ SDValue V1 = buildHvxVectorSingle(A.drop_front(Size/2), dl, SingleTy, DAG);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
+ }
+
+ return buildHvxVectorSingle(Ops, dl, VecTy, DAG);
+}
+
+SDValue
HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
const {
// Change the type of the extracted element to i32.
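A simplified scalar model of the predicate construction in buildHvxVectorPred above, for the VecLen <= HwLen case: each i1 lane is replicated into HwLen/VecLen bytes, and the predicate is then equivalent to a per-byte unsigned comparison against zero. Sketch for illustration only.

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative only: expand each i1 lane into HwLen/VecLen bytes.  Comparing
// each resulting byte >u 0 recovers the original predicate lanes.
std::vector<uint8_t> expandPredToBytes(const std::vector<bool> &Lanes,
                                       unsigned HwLen) {
  unsigned VecLen = Lanes.size();
  assert(VecLen != 0 && HwLen % VecLen == 0);
  unsigned BitBytes = HwLen / VecLen;

  std::vector<uint8_t> Bytes;
  Bytes.reserve(HwLen);
  for (bool L : Lanes)
    Bytes.insert(Bytes.end(), BitBytes, L ? 1 : 0);
  return Bytes;
}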
@@ -399,6 +477,10 @@ HexagonTargetLowering::LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
// (negate (swap-op NewCmp)),
// the condition code for the NewCmp should be calculated from the original
// CC by applying these operations in the reverse order.
+ //
+ // This could also be done through setCondCodeAction, but for negation it
+ // uses an xor with a vector of -1s, which it obtains from BUILD_VECTOR.
+ // That is far too expensive for what can be done with a single instruction.
switch (CC) {
case ISD::SETNE: // !eq
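The swap/negate bookkeeping described in the comment above can be stated directly for integer condition codes. The sketch below is a generic illustration (hypothetical enum, not ISD::CondCode) of how a condition is rewritten when the operands are swapped or the whole compare is negated.

// Illustrative only: generic integer condition codes.
enum Cond { EQ, NE, LT, LE, GT, GE };

// a <C> b  is equivalent to  b <swapOperands(C)> a.
Cond swapOperands(Cond C) {
  switch (C) {
  case LT: return GT;
  case LE: return GE;
  case GT: return LT;
  case GE: return LE;
  default: return C;   // EQ and NE are symmetric.
  }
}

// !(a <C> b)  is equivalent to  a <negate(C)> b.
Cond negate(Cond C) {
  switch (C) {
  case EQ: return NE;
  case NE: return EQ;
  case LT: return GE;
  case LE: return GT;
  case GT: return LE;
  case GE: return LT;
  }
  return C;
}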
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index e2120d3de2ef..cdc2085986a5 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -2899,6 +2899,8 @@ def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
+def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>;
+
def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
[SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
@@ -2920,7 +2922,14 @@ let Predicates = [UseHVX] in {
def: OpR_RR_pat<V6_vpackoh, pf2<HexagonVPACKO>, VecI16, HVI16>;
}
+def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
+def vzero: PatFrag<(ops), (HexagonVZERO)>;
+
let Predicates = [UseHVX] in {
+ def: Pat<(VecI8 vzero), (V6_vd0)>;
+ def: Pat<(VecI16 vzero), (V6_vd0)>;
+ def: Pat<(VecI32 vzero), (V6_vd0)>;
+
def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
(Combinev HvxVR:$Vt, HvxVR:$Vs)>;
def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)),
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 2ceed70c2497..1d1e85e7ac7e 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -242,7 +242,7 @@ def VecQ32
// FIXME: the register order should be defined in terms of the preferred
// allocation order...
//
-def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v32i1, v4i8, v2i16], 32,
(add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),
R10, R11, R29, R30, R31)>;
@@ -254,7 +254,8 @@ def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
(add R7, R6, R5, R4, R3, R2, R1, R0)> ;
-def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
+def DoubleRegs : RegisterClass<"Hexagon",
+ [i64, f64, v64i1, v8i8, v4i16, v2i32], 64,
(add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;
def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 678ef210d0ae..af93f20d97fc 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -204,14 +204,38 @@ public:
llvm_unreachable("Invalid HVX vector length settings");
}
- bool isHVXVectorType(MVT VecTy) const {
+ ArrayRef<MVT> getHVXElementTypes() const {
+ static MVT Types[] = { MVT::i8, MVT::i16, MVT::i32 };
+ return makeArrayRef(Types);
+ }
+
+ bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const {
if (!VecTy.isVector() || !useHVXOps())
return false;
- unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits();
- if (ElemWidth < 8 || ElemWidth > 64)
+ MVT ElemTy = VecTy.getVectorElementType();
+ if (!IncludeBool && ElemTy == MVT::i1)
+ return false;
+
+ unsigned HwLen = getVectorLength();
+ unsigned NumElems = VecTy.getVectorNumElements();
+ ArrayRef<MVT> ElemTypes = getHVXElementTypes();
+
+ if (IncludeBool && ElemTy == MVT::i1) {
+ // Special case for the v512i1, etc.
+ if (8*HwLen == NumElems)
+ return true;
+ // Boolean HVX vector types are formed from regular HVX vector types
+ // by replacing the element type with i1.
+ for (MVT T : ElemTypes)
+ if (NumElems * T.getSizeInBits() == 8*HwLen)
+ return true;
return false;
+ }
+
unsigned VecWidth = VecTy.getSizeInBits();
- return VecWidth == 8*getVectorLength() || VecWidth == 16*getVectorLength();
+ if (VecWidth != 8*HwLen && VecWidth != 16*HwLen)
+ return false;
+ return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; });
}
unsigned getL1CacheLineSize() const;
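A simplified restatement of the boolean-vector check added above, using plain integers instead of MVT; it assumes HwLenBytes is the HVX register length in bytes (64 or 128).

#include <initializer_list>

// Illustrative only: a boolean vector has an HVX shape if it has one lane
// per register bit (v512i1/v1024i1), or as many lanes as a regular HVX
// vector with i8/i16/i32 elements would have.
bool isHvxBoolVectorShape(unsigned NumElems, unsigned HwLenBytes) {
  if (NumElems == 8 * HwLenBytes)
    return true;
  for (unsigned ElemBits : {8u, 16u, 32u})
    if (NumElems * ElemBits == 8 * HwLenBytes)
      return true;
  return false;
}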
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 0c40a7b8f382..363b703fef28 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -258,10 +258,9 @@ void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
});
}
-TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(HexagonTTIImpl(this, F));
- });
+TargetTransformInfo
+HexagonTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(HexagonTTIImpl(this, F));
}
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index acd41f920b53..a7c6a3437fbc 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -39,7 +39,7 @@ public:
void adjustPassManager(PassManagerBuilder &PMB) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
HexagonTargetObjectFile *getObjFileLowering() const override {
return static_cast<HexagonTargetObjectFile*>(TLOF.get());
diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp
index 9a73c95d6516..2c21a53b13bb 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -74,10 +74,9 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(LanaiTTIImpl(this, F));
- });
+TargetTransformInfo
+LanaiTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(LanaiTTIImpl(this, F));
}
namespace {
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
index 2fb1a0536104..0db286ec13e7 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -42,7 +42,7 @@ public:
return &Subtarget;
}
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index ac81e6207456..2f6dd0035de3 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -188,7 +188,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
// so we have to special check for them.
unsigned Opcode = TmpInst.getOpcode();
if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) &&
- (Opcode != Mips::SLL_MM) && !Binary)
+ (Opcode != Mips::SLL_MM) && (Opcode != Mips::SLL_MMR6) && !Binary)
llvm_unreachable("unimplemented opcode in encodeInstruction()");
int NewOpcode = -1;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 50537bed8ff0..c85ee20273c0 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -38,7 +38,7 @@ class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>
let Namespace = "Mips";
}
-// Mips CPU Registers
+// Mips CPU Registers.
class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;
// Mips 64-bit CPU Registers
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 85193bffef56..fb79a4bf40c5 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -259,17 +259,16 @@ void MipsPassConfig::addPreRegAlloc() {
addPass(createMipsOptimizePICCallPass());
}
-TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- if (Subtarget->allowMixed16_32()) {
- DEBUG(errs() << "No Target Transform Info Pass Added\n");
- // FIXME: This is no longer necessary as the TTI returned is per-function.
- return TargetTransformInfo(F.getParent()->getDataLayout());
- }
-
- DEBUG(errs() << "Target Transform Info Pass Added\n");
- return TargetTransformInfo(BasicTTIImpl(this, F));
- });
+TargetTransformInfo
+MipsTargetMachine::getTargetTransformInfo(const Function &F) {
+ if (Subtarget->allowMixed16_32()) {
+ DEBUG(errs() << "No Target Transform Info Pass Added\n");
+ // FIXME: This is no longer necessary as the TTI returned is per-function.
+ return TargetTransformInfo(F.getParent()->getDataLayout());
+ }
+
+ DEBUG(errs() << "Target Transform Info Pass Added\n");
+ return TargetTransformInfo(BasicTTIImpl(this, F));
}
// Implemented by targets that want to run passes immediately before
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index ccfc9a938d9c..56e6e5d8daa2 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -44,7 +44,7 @@ public:
CodeGenOpt::Level OL, bool JIT, bool isLittle);
~MipsTargetMachine() override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
const MipsSubtarget *getSubtargetImpl() const {
if (Subtarget)
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85f757878f94..d31e1cb5047b 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -180,10 +180,9 @@ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
});
}
-TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(NVPTXTTIImpl(this, F));
- });
+TargetTransformInfo
+NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(NVPTXTTIImpl(this, F));
}
void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 54a72a688ee3..eeebf64d39c3 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -63,7 +63,7 @@ public:
void adjustPassManager(PassManagerBuilder &) override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
bool isMachineVerifierClean() const override {
return false;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c870a2256691..7902da20a010 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1531,11 +1531,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
- DebugLoc dl;
- if (MBBI != MBB.end())
- dl = MBBI->getDebugLoc();
+ // If we got this far, a first terminator should exist.
+ assert(MBBI != MBB.end() && "Failed to find the first terminator.");
+ DebugLoc dl = MBBI->getDebugLoc();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
// Create branch instruction for pseudo tail call return instruction
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 18e567fa589c..cea59de3e8a9 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11882,6 +11882,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
SDLoc dl(N);
SDValue Op(N, 0);
+ // Don't handle ppc_fp128 here or i1 conversions.
+ if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+ return SDValue();
+ if (Op.getOperand(0).getValueType() == MVT::i1)
+ return SDValue();
+
SDValue FirstOperand(Op.getOperand(0));
bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
(FirstOperand.getValueType() == MVT::i8 ||
@@ -11910,11 +11916,6 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
}
- // Don't handle ppc_fp128 here or i1 conversions.
- if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
- return SDValue();
- if (Op.getOperand(0).getValueType() == MVT::i1)
- return SDValue();
// For i32 intermediate values, unfortunately, the conversion functions
// leave the upper 32 bits of the value undefined. Within the set of
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index a2640727f813..474661aaaee8 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -1025,9 +1025,6 @@ bool PPCMIPeephole::eliminateRedundantTOCSaves(
// bge 0, .LBB0_4
bool PPCMIPeephole::eliminateRedundantCompare(void) {
- // FIXME: this transformation is causing miscompiles. Disabling it for now
- // until we can resolve the issue.
- return false;
bool Simplified = false;
for (MachineBasicBlock &MBB2 : *MF) {
@@ -1087,10 +1084,21 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
// we replace it with a signed comparison if the comparison
// to be merged is a signed comparison.
// In other cases of opcode mismatch, we cannot optimize this.
- if (isEqOrNe(BI2) &&
+
+ // We cannot change the opcode when comparing against an immediate
+ // if the most significant bit of the immediate is one
+ // due to the difference in sign extension.
+ auto CmpAgainstImmWithSignBit = [](MachineInstr *I) {
+ if (!I->getOperand(2).isImm())
+ return false;
+ int16_t Imm = (int16_t)I->getOperand(2).getImm();
+ return Imm < 0;
+ };
+
+ if (isEqOrNe(BI2) && !CmpAgainstImmWithSignBit(CMPI2) &&
CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode()))
NewOpCode = CMPI1->getOpcode();
- else if (isEqOrNe(BI1) &&
+ else if (isEqOrNe(BI1) && !CmpAgainstImmWithSignBit(CMPI1) &&
getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode())
NewOpCode = CMPI2->getOpcode();
else continue;
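To make the sign-extension caveat above concrete: the same 16-bit encoding is sign-extended by signed compares and zero-extended by unsigned ones, so when the immediate's top bit is set the two opcodes compare against different values. A small standalone illustration:

#include <cstdint>
#include <cstdio>

int main() {
  int16_t Imm = (int16_t)0x8000;                 // encoding with the sign bit set
  long long AsSigned = Imm;                      // sign-extended: -32768
  unsigned long long AsUnsigned = (uint16_t)Imm; // zero-extended:  32768
  std::printf("signed view: %lld, unsigned view: %llu\n", AsSigned, AsUnsigned);
  return 0;
}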
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 491f25ca2c64..20a83c973026 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -451,8 +451,7 @@ void PPCPassConfig::addPreEmitPass() {
addPass(createPPCBranchSelectionPass(), false);
}
-TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(PPCTTIImpl(this, F));
- });
+TargetTransformInfo
+PPCTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(PPCTTIImpl(this, F));
}
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 102bf7ca59c2..75b98a815ab4 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -49,7 +49,7 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index e74d68182949..3a167a6d452a 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -257,8 +257,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
return new SystemZPassConfig(*this, PM);
}
-TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(SystemZTTIImpl(this, F));
- });
+TargetTransformInfo
+SystemZTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(SystemZTTIImpl(this, F));
}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 95ad5e339e0b..52bf8bba55de 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -44,7 +44,7 @@ public:
// Override LLVMTargetMachine
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index ad63c7a9cb30..c4c0dd22ee0c 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -219,10 +219,8 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
-TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([](const Function &F) {
- return TargetTransformInfo(F.getParent()->getDataLayout());
- });
+TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(F.getParent()->getDataLayout());
}
void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
@@ -244,3 +242,10 @@ MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {
getNameWithPrefix(NameStr, GV, TLOF->getMangler());
return TLOF->getContext().getOrCreateSymbol(NameStr);
}
+
+TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
+ // Since Analysis can't depend on Target, use a std::function to invert the
+ // dependency.
+ return TargetIRAnalysis(
+ [this](const Function &F) { return this->getTargetTransformInfo(F); });
+}
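The pattern above, shown in isolation with hypothetical stand-in types: the analysis side stores only a std::function, so it never names the concrete target, while each target overrides the virtual hook and the base class wraps it in a lambda.

#include <functional>

struct Result {};                      // stand-in for TargetTransformInfo
struct Analysis {                      // stand-in for TargetIRAnalysis
  std::function<Result(int /*FunctionId*/)> Callback;
};

struct Machine {                       // stand-in for TargetMachine
  virtual ~Machine() = default;
  virtual Result getResult(int) { return Result(); }  // default implementation

  Analysis getAnalysis() {
    // Capturing 'this' dispatches to the most-derived override at call time.
    return Analysis{[this](int F) { return getResult(F); }};
  }
};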
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 2bdba96ab674..a4bb967f36f6 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -746,6 +746,14 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ // Disable the TEE optimization if we aren't doing direct wasm object
+ // emission, because lowering TEE to TEE_LOCAL is done in the ExplicitLocals
+ // pass, which is also disabled.
+ bool UseTee = true;
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF())
+ UseTee = false;
+
// Walk the instructions from the bottom up. Currently we don't look past
// block boundaries, and the blocks aren't ordered so the block visitation
// order isn't significant, but we may want to change this in the future.
@@ -811,7 +819,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Insert =
RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
- } else if (CanMove &&
+ } else if (UseTee && CanMove &&
OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
MRI, TII);
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 2599064334ee..f808c063d7e4 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -223,6 +223,8 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
/* SINCOS_F80 */ unsupported,
/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
/* SINCOS_PPCF128 */ unsupported,
+/* SINCOS_STRET_F32 */ unsupported,
+/* SINCOS_STRET_F64 */ unsupported,
/* POW_F32 */ f32_func_f32_f32,
/* POW_F64 */ f64_func_f64_f64,
/* POW_F80 */ unsupported,
@@ -390,8 +392,9 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
// MEMORY
/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
-/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
+/* BZERO */ unsupported,
// ELEMENT-WISE ATOMIC MEMORY
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
@@ -687,6 +690,8 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
/* SINCOS_F80 */ nullptr,
/* SINCOS_F128 */ "sincosl",
/* SINCOS_PPCF128 */ nullptr,
+/* SINCOS_STRET_F32 */ nullptr,
+/* SINCOS_STRET_F64 */ nullptr,
/* POW_F32 */ "powf",
/* POW_F64 */ "pow",
/* POW_F80 */ nullptr,
@@ -850,6 +855,7 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
/* MEMCPY */ "memcpy",
/* MEMMOVE */ "memmove",
/* MEMSET */ "memset",
+/* BZERO */ nullptr,
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 3cc19ef5fbab..d38cde74d2ec 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -146,10 +146,9 @@ public:
};
} // end anonymous namespace
-TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
- });
+TargetTransformInfo
+WebAssemblyTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
}
TargetPassConfig *
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 224849526514..dd826befd117 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -43,8 +43,7 @@ public:
return TLOF.get();
}
- /// \brief Get the TargetIRAnalysis for this target.
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
bool usesPhysRegsForPEI() const override { return false; }
};
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 78385ae1877b..239db2a74b24 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -78,7 +78,7 @@ public:
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
- CPU != "c3" && CPU != "c3-2" && CPU != "lakemont";
+ CPU != "c3" && CPU != "c3-2" && CPU != "lakemont" && CPU != "";
}
unsigned getNumFixupKinds() const override {
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 08731cd0204c..7e7c35569093 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -137,7 +137,7 @@ def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
-def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1",
+def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
"true",
"Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
@@ -263,6 +263,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
+// On recent X86 (port-bound) processors, it's preferable to combine to a
+// single shuffle using a variable mask over multiple fixed shuffles.
+def FeatureFastVariableShuffle
+ : SubtargetFeature<"fast-variable-shuffle",
+ "HasFastVariableShuffle",
+ "true", "Shuffles with variable masks are fast">;
// On some X86 processors, there is no performance hazard to writing only the
// lower parts of a YMM or ZMM register without clearing the upper part.
def FeatureFastPartialYMMorZMMWrite
@@ -620,7 +626,8 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureERMSB,
FeatureFMA,
FeatureLZCNT,
- FeatureMOVBE
+ FeatureMOVBE,
+ FeatureFastVariableShuffle
]>;
class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
@@ -632,7 +639,8 @@ def : HaswellProc<"core-avx2">; // Legacy alias.
def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,
- FeatureRDSEED
+ FeatureRDSEED,
+ FeaturePRFCHW
]>;
class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
BDWFeatures.Value, [
@@ -669,7 +677,8 @@ def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA
+ FeatureFMA,
+ FeaturePRFCHW
]>;
// FIXME: define KNL model
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index 0a87fb4533c2..ba7280c29cc9 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -301,60 +301,21 @@ typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>
/// different closure that manipulates the loaded or stored value.
class Closure {
private:
- const TargetInstrInfo *TII;
- MachineRegisterInfo *MRI;
-
/// Virtual registers in the closure.
DenseSet<unsigned> Edges;
/// Instructions in the closure.
SmallVector<MachineInstr *, 8> Instrs;
- /// A map of available Instruction Converters.
- const InstrConverterBaseMap &Converters;
-
- /// The register domain of this closure.
- RegDomain Domain;
-
/// Domains which this closure can legally be reassigned to.
std::bitset<NumDomains> LegalDstDomains;
- /// Enqueue \p Reg to be considered for addition to the closure.
- void visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist);
-
- /// Add \p MI to this closure.
- void encloseInstr(MachineInstr *MI);
-
- /// Calculate the total cost of reassigning the closure to \p Domain.
- double calculateCost(RegDomain Domain) const;
-
- /// All edges that are included in some closure.
- DenseSet<unsigned> &EnclosedEdges;
-
- /// All instructions that are included in some closure.
- DenseMap<MachineInstr *, Closure *> &EnclosedInstrs;
-
public:
- Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
- const InstrConverterBaseMap &Converters,
- std::initializer_list<RegDomain> LegalDstDomainList,
- DenseSet<unsigned> &EnclosedEdges,
- DenseMap<MachineInstr *, Closure *> &EnclosedInstrs)
- : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain),
- EnclosedEdges(EnclosedEdges), EnclosedInstrs(EnclosedInstrs) {
+ Closure(std::initializer_list<RegDomain> LegalDstDomainList) {
for (RegDomain D : LegalDstDomainList)
LegalDstDomains.set(D);
}
- /// Starting from \Reg, expand the closure as much as possible.
- void buildClosure(unsigned E);
-
- /// /returns true if it is profitable to reassign the closure to \p Domain.
- bool isReassignmentProfitable(RegDomain Domain) const;
-
- /// Reassign the closure to \p Domain.
- void Reassign(RegDomain Domain) const;
-
/// Mark this closure as illegal for reassignment to all domains.
void setAllIllegal() { LegalDstDomains.reset(); }
@@ -364,10 +325,41 @@ public:
/// \returns true if is legal to reassign this closure to domain \p RD.
bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; }
+ /// Mark this closure as illegal for reassignment to domain \p RD.
+ void setIllegal(RegDomain RD) { LegalDstDomains[RD] = false; }
+
bool empty() const { return Edges.empty(); }
+
+ bool insertEdge(unsigned Reg) {
+ return Edges.insert(Reg).second;
+ }
+
+ using const_edge_iterator = DenseSet<unsigned>::const_iterator;
+ iterator_range<const_edge_iterator> edges() const {
+ return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end());
+ }
+
+ void addInstruction(MachineInstr *I) {
+ Instrs.push_back(I);
+ }
+
+ ArrayRef<MachineInstr *> instructions() const {
+ return Instrs;
+ }
+
};
class X86DomainReassignment : public MachineFunctionPass {
+ const X86Subtarget *STI;
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+
+ /// All edges that are included in some closure.
+ DenseSet<unsigned> EnclosedEdges;
+
+ /// All instructions that are included in some closure.
+ DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+
public:
static char ID;
@@ -387,22 +379,39 @@ public:
}
private:
- const X86Subtarget *STI;
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
-
/// A map of available Instruction Converters.
InstrConverterBaseMap Converters;
/// Initialize Converters map.
void initConverters();
+
+ /// Starting from \p Reg, expand the closure as much as possible.
+ void buildClosure(Closure &, unsigned Reg);
+
+ /// Enqueue \p Reg to be considered for addition to the closure.
+ void visitRegister(Closure &, unsigned Reg, RegDomain &Domain,
+ SmallVectorImpl<unsigned> &Worklist);
+
+ /// Reassign the closure to \p Domain.
+ void reassign(const Closure &C, RegDomain Domain) const;
+
+ /// Add \p MI to the closure.
+ void encloseInstr(Closure &C, MachineInstr *MI);
+
+ /// \returns true if it is profitable to reassign the closure to \p Domain.
+ bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const;
+
+ /// Calculate the total cost of reassigning the closure to \p Domain.
+ double calculateCost(const Closure &C, RegDomain Domain) const;
};
char X86DomainReassignment::ID = 0;
} // End anonymous namespace.
-void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {
+void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
+ RegDomain &Domain,
+ SmallVectorImpl<unsigned> &Worklist) {
if (EnclosedEdges.count(Reg))
return;
@@ -423,59 +432,61 @@ void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {
Worklist.push_back(Reg);
}
-void Closure::encloseInstr(MachineInstr *MI) {
+void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
auto I = EnclosedInstrs.find(MI);
if (I != EnclosedInstrs.end()) {
- if (I->second != this)
+ if (I->second != &C)
// Instruction already belongs to another closure, avoid conflicts between
// closures and mark this closure as illegal.
- setAllIllegal();
+ C.setAllIllegal();
return;
}
- EnclosedInstrs[MI] = this;
- Instrs.push_back(MI);
+ EnclosedInstrs[MI] = &C;
+ C.addInstruction(MI);
// Mark closure as illegal for reassignment to domains, if there is no
// converter for the instruction or if the converter cannot convert the
// instruction.
- for (unsigned i = 0; i != LegalDstDomains.size(); ++i) {
- if (LegalDstDomains[i]) {
+ for (int i = 0; i != NumDomains; ++i) {
+ if (C.isLegal((RegDomain)i)) {
InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});
if (!IC || !IC->isLegal(MI, TII))
- LegalDstDomains[i] = false;
+ C.setIllegal((RegDomain)i);
}
}
}
-double Closure::calculateCost(RegDomain DstDomain) const {
- assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
+double X86DomainReassignment::calculateCost(const Closure &C,
+ RegDomain DstDomain) const {
+ assert(C.isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
double Cost = 0.0;
- for (auto MI : Instrs)
+ for (auto *MI : C.instructions())
Cost +=
Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
return Cost;
}
-bool Closure::isReassignmentProfitable(RegDomain Domain) const {
- return calculateCost(Domain) < 0.0;
+bool X86DomainReassignment::isReassignmentProfitable(const Closure &C,
+ RegDomain Domain) const {
+ return calculateCost(C, Domain) < 0.0;
}
-void Closure::Reassign(RegDomain Domain) const {
- assert(isLegal(Domain) && "Cannot convert illegal closure");
+void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
+ assert(C.isLegal(Domain) && "Cannot convert illegal closure");
// Iterate all instructions in the closure, convert each one using the
// appropriate converter.
SmallVector<MachineInstr *, 8> ToErase;
- for (auto MI : Instrs)
+ for (auto *MI : C.instructions())
if (Converters.lookup({Domain, MI->getOpcode()})
->convertInstr(MI, TII, MRI))
ToErase.push_back(MI);
// Iterate all registers in the closure, replace them with registers in the
// destination domain.
- for (unsigned Reg : Edges) {
+ for (unsigned Reg : C.edges()) {
MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
for (auto &MO : MRI->use_operands(Reg)) {
if (MO.isReg())
@@ -512,18 +523,19 @@ static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
return false;
}
-void Closure::buildClosure(unsigned Reg) {
+void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
SmallVector<unsigned, 4> Worklist;
- visitRegister(Reg, Worklist);
+ RegDomain Domain = NoDomain;
+ visitRegister(C, Reg, Domain, Worklist);
while (!Worklist.empty()) {
unsigned CurReg = Worklist.pop_back_val();
// Register already in this closure.
- if (!Edges.insert(CurReg).second)
+ if (!C.insertEdge(CurReg))
continue;
MachineInstr *DefMI = MRI->getVRegDef(CurReg);
- encloseInstr(DefMI);
+ encloseInstr(C, DefMI);
// Add register used by the defining MI to the worklist.
// Do not add registers which are used in address calculation, they will be
@@ -542,7 +554,7 @@ void Closure::buildClosure(unsigned Reg) {
auto &Op = DefMI->getOperand(OpIdx);
if (!Op.isReg() || !Op.isUse())
continue;
- visitRegister(Op.getReg(), Worklist);
+ visitRegister(C, Op.getReg(), Domain, Worklist);
}
// Expand closure through register uses.
@@ -550,10 +562,10 @@ void Closure::buildClosure(unsigned Reg) {
// We would like to avoid converting closures which calculate addresses,
// as this should remain in GPRs.
if (usedAsAddr(UseMI, CurReg, TII)) {
- setAllIllegal();
+ C.setAllIllegal();
continue;
}
- encloseInstr(&UseMI);
+ encloseInstr(C, &UseMI);
for (auto &DefOp : UseMI.defs()) {
if (!DefOp.isReg())
@@ -561,10 +573,10 @@ void Closure::buildClosure(unsigned Reg) {
unsigned DefReg = DefOp.getReg();
if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
- setAllIllegal();
+ C.setAllIllegal();
continue;
}
- visitRegister(DefReg, Worklist);
+ visitRegister(C, DefReg, Domain, Worklist);
}
}
}
@@ -701,8 +713,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
initConverters();
bool Changed = false;
- DenseSet<unsigned> EnclosedEdges;
- DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+ EnclosedEdges.clear();
+ EnclosedInstrs.clear();
std::vector<Closure> Closures;
@@ -719,9 +731,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
continue;
// Calculate closure starting with Reg.
- Closure C(TII, MRI, Converters, {MaskDomain}, EnclosedEdges,
- EnclosedInstrs);
- C.buildClosure(Reg);
+ Closure C({MaskDomain});
+ buildClosure(C, Reg);
// Collect all closures that can potentially be converted.
if (!C.empty() && C.isLegal(MaskDomain))
@@ -729,8 +740,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
}
for (Closure &C : Closures)
- if (C.isReassignmentProfitable(MaskDomain)) {
- C.Reassign(MaskDomain);
+ if (isReassignmentProfitable(C, MaskDomain)) {
+ reassign(C, MaskDomain);
++NumClosuresConverted;
Changed = true;
}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index a6c7c5f22a3a..660c1eff3c4b 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -106,14 +106,15 @@ namespace {
if (Base_Reg.getNode())
Base_Reg.getNode()->dump();
else
- dbgs() << "nul";
- dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
- << " Scale" << Scale << '\n'
+ dbgs() << "nul\n";
+ if (BaseType == FrameIndexBase)
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
+ dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (IndexReg.getNode())
IndexReg.getNode()->dump();
else
- dbgs() << "nul";
+ dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
<< "GV ";
if (GV)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a72f4daa5e11..5ac5d0348f8a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -461,7 +461,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
- if (Subtarget.hasSSE1())
+ if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
@@ -1622,16 +1622,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallName(RTLIB::MUL_I128, nullptr);
}
- // Combine sin / cos into one node or libcall if possible.
- if (Subtarget.hasSinCos()) {
- setLibcallName(RTLIB::SINCOS_F32, "sincosf");
- setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget.isTargetDarwin()) {
- // For MacOSX, we don't want the normal expansion of a libcall to sincos.
- // We want to issue a libcall to __sincos_stret to avoid memory traffic.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ // Combine sin / cos into _sincos_stret if it is available.
+ if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+ getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
if (Subtarget.isTargetWin64()) {
@@ -7480,9 +7475,9 @@ static bool isAddSub(const BuildVectorSDNode *BV,
}
/// Returns true if is possible to fold MUL and an idiom that has already been
-/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
-/// If (and only if) true is returned, the operands of FMADDSUB are written to
-/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
+/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
+/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1,
+/// \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
@@ -7505,12 +7500,12 @@ static bool isAddSub(const BuildVectorSDNode *BV,
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
-static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
- SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
- unsigned ExpectedUses) {
+static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
+ SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
+ unsigned ExpectedUses) {
if (Opnd0.getOpcode() != ISD::FMUL ||
- !Opnd0->hasNUsesOfValue(ExpectedUses, 0) ||
- !Subtarget.hasAnyFMA())
+ !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
return false;
// FIXME: These checks must match the similar ones in
@@ -7547,7 +7542,7 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
SDValue Opnd2;
// TODO: According to coverage reports, the FMADDSUB transform is not
// triggered by any tests.
- if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
@@ -11958,6 +11953,19 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
return 0;
}
+static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+ SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+}
+
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
@@ -12148,6 +12156,10 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
+
+ // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
+ if (Subtarget.hasVBMI() && Subtarget.hasVLX())
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
}
return PSHUFB;
@@ -13048,19 +13060,6 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
DAG.getConstant(Immediate, DL, MVT::i8));
}
-static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
- MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
-
- SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
- if (V2.isUndef())
- return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
-
- return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
-}
-
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -13615,6 +13614,10 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
+ // AVX512VBMI + VL can lower to VPERMB.
+ if (Subtarget.hasVBMI() && Subtarget.hasVLX())
+ return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -14077,6 +14080,10 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+ return PSHUFB;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
@@ -14212,7 +14219,9 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
ExtVT = MVT::v4i32;
break;
case MVT::v8i1:
- ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
+ // Take 512-bit type, more shuffles on KNL. If we have VLX, use a 256-bit
+ // shuffle.
+ ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
break;
case MVT::v16i1:
ExtVT = MVT::v16i32;
@@ -14569,11 +14578,10 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
- unsigned VecSize = (NumElts <= 4 ? 128 : 512);
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
- SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- ExtVT.getVectorElementType(), Ext, Idx);
+ MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
+ MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
@@ -14768,12 +14776,11 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
// Non constant index. Extend source and destination,
// insert element and then truncate the result.
unsigned NumElts = VecVT.getVectorNumElements();
- unsigned VecSize = (NumElts <= 4 ? 128 : 512);
- MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
- MVT ExtEltVT = ExtVecVT.getVectorElementType();
+ MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
+ MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
- DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
- DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
@@ -16287,21 +16294,6 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
return SelectedVal;
}
-static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDValue In = Op->getOperand(0);
- MVT InVT = In.getSimpleValueType();
-
- if (InVT.getVectorElementType() == MVT::i1)
- return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
-
- if (Subtarget.hasFp256())
- if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
- return Res;
-
- return SDValue();
-}
-
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
@@ -16440,7 +16432,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
+ MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
ShiftInx = InVT.getScalarSizeInBits() - 1;
@@ -18446,6 +18439,21 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
return V;
}
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ if (InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+
+ if (Subtarget.hasFp256())
+ if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
+ return Res;
+
+ return SDValue();
+}
+
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
@@ -21128,7 +21136,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
- SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
DAG.getConstant(-1, dl, MVT::i8));
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
@@ -22231,6 +22239,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
+ assert(VT == MVT::v16i8 && "Unexpected VT");
+
SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
@@ -22989,12 +22999,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
(Subtarget.hasAVX512() && VT == MVT::v16i16) ||
(Subtarget.hasAVX512() && VT == MVT::v16i8) ||
(Subtarget.hasBWI() && VT == MVT::v32i8)) {
- MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
+ assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
+ "Unexpected vector type");
+ MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc =
Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
- Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+ Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
}
@@ -24101,8 +24113,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
- const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+ const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
@@ -24928,7 +24941,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
- EVT SrcVT = N->getOperand(0)->getValueType(0);
+ EVT SrcVT = N->getOperand(0).getValueType();
if (SrcVT != MVT::f64 ||
(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
@@ -28407,8 +28420,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// TODO - attempt to narrow Mask back to writemask size.
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
- if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
- return SDValue();
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
@@ -28491,11 +28502,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
- ShuffleVT)) {
+ ShuffleVT) &&
+ (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return SDValue(); // AVX512 Writemask clash.
Res = DAG.getBitcast(ShuffleSrcVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
@@ -28505,11 +28515,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle,
- ShuffleVT, PermuteImm)) {
+ ShuffleVT, PermuteImm) &&
+ (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return SDValue(); // AVX512 Writemask clash.
Res = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
@@ -28520,12 +28529,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
- ShuffleVT, UnaryShuffle)) {
+ V1, V2, DL, DAG, Subtarget, Shuffle,
+ ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
+ (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return SDValue(); // AVX512 Writemask clash.
V1 = DAG.getBitcast(ShuffleSrcVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ShuffleSrcVT, V2);
@@ -28538,11 +28546,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, V1, V2, DL, DAG,
Subtarget, Shuffle, ShuffleVT,
- PermuteImm)) {
+ PermuteImm) &&
+ (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return SDValue(); // AVX512 Writemask clash.
V1 = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ShuffleVT, V2);
@@ -28594,8 +28601,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
- // TODO This should probably be target specific.
- bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
+ int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
+ bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
@@ -29698,17 +29705,18 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
-/// Returns true iff the shuffle node \p N can be replaced with ADDSUB
-/// operation. If true is returned then the operands of ADDSUB operation
+/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
+/// operation. If true is returned, then the operands of the ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
-/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
+/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
-static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
- SDValue &Opnd0, SDValue &Opnd1) {
+static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
+ SDValue &Opnd0, SDValue &Opnd1,
+ bool matchSubAdd = false) {
EVT VT = N->getValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
@@ -29728,12 +29736,15 @@ static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
- // We require the first shuffle operand to be the FSUB node, and the second to
- // be the FADD node.
- if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+ unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
+ unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;
+
+ // We require the first shuffle operand to be the ExpectedOpcode node,
+ // and the second to be the NextExpectedOpcode node.
+ if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
- } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
+ } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)
return false;
// If there are other uses of these operations we can't fold them.
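A scalar model of the two idioms matched above: ADDSUB subtracts in the even lanes and adds in the odd lanes, SUBADD does the opposite; the DAG-level pattern is a lane-by-lane blend of an FSUB node and an FADD node. Sketch for illustration only.

#include <array>
#include <cstddef>

// Illustrative only: compute the ADDSUB (or SUBADD) result element-wise.
template <std::size_t N>
std::array<float, N> addSubOrSubAdd(const std::array<float, N> &A,
                                    const std::array<float, N> &B,
                                    bool MatchSubAdd) {
  std::array<float, N> R{};
  for (std::size_t I = 0; I != N; ++I) {
    bool EvenLane = (I % 2) == 0;
    bool DoSub = MatchSubAdd ? !EvenLane : EvenLane;
    R[I] = DoSub ? A[I] - B[I] : A[I] + B[I];
  }
  return R;
}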
@@ -29767,7 +29778,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
- if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
+ if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))
return SDValue();
EVT VT = N->getValueType(0);
@@ -29775,7 +29786,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
@@ -29787,6 +29798,26 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
+/// \brief Try to combine a shuffle into a target-specific
+/// mul-sub-add node.
+static SDValue combineShuffleToFMSubAdd(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Try to generate X86ISD::FMSUBADD node here.
+ SDValue Opnd2;
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
+ return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ return SDValue();
+}
+
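For reference, the even/odd lane pattern that isAddSubOrSubAdd and the two combiners above match can be modelled on scalars; this is an illustrative sketch, not code from the patch:

    #include <array>

    // Scalar model of the ADDSUB / SUBADD lane behaviour (illustrative).
    // ADDSUB subtracts in even lanes and adds in odd lanes; SUBADD does the
    // opposite, which is what isAddSubOrSubAdd matches when matchSubAdd is true.
    std::array<float, 4> addSubModel(std::array<float, 4> A,
                                     std::array<float, 4> B, bool SubAdd) {
      std::array<float, 4> R;
      for (unsigned I = 0; I != 4; ++I) {
        bool DoSub = SubAdd ? (I % 2 == 1) : (I % 2 == 0);
        R[I] = DoSub ? A[I] - B[I] : A[I] + B[I];
      }
      return R;
    }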
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
@@ -29873,11 +29904,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have legalized the vector types, look for blends of FADD and FSUB
- // nodes that we can fuse into an ADDSUB node.
+ // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
+ if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
+ return FMSubAdd;
+
if (SDValue HAddSub = foldShuffleOfHorizOp(N))
return HAddSub;
}
@@ -30181,7 +30215,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- N0->getOperand(0)->getValueType(0).is256BitVector()) {
+ N0->getOperand(0).getValueType().is256BitVector()) {
SExtVT = MVT::v4i64;
FPCastVT = MVT::v4f64;
}
@@ -30194,8 +30228,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (N0->getOperand(0)->getValueType(0).is256BitVector() ||
- N0->getOperand(0)->getValueType(0).is512BitVector())) {
+ (N0->getOperand(0).getValueType().is256BitVector() ||
+ N0->getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
FPCastVT = MVT::v8f32;
}
@@ -30484,7 +30518,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
-// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW.
+// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
+// PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
@@ -30492,7 +30527,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
- if (ExtractVT != MVT::i16)
+ if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
@@ -30504,7 +30539,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
- if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+ if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
@@ -30520,22 +30555,39 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
- assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+ assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
+ (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
+ "Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
SDValue Mask;
+ unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
- Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+ Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
- Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+ Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
- Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+ Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
- MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+ // For v16i8 cases we need to perform UMIN on pairs of byte elements,
+ // shuffling each upper element down and inserting zeros. This means that
+ // the v16i8 UMIN will leave the upper element as zero, giving a
+ // zero-extended value ready for the PHMINPOS.
+ if (ExtractVT == MVT::i8) {
+ SDValue Upper = DAG.getVectorShuffle(
+ SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
+ {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
+ MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
+ }
+
+ // Perform the PHMINPOS on a v8i16 vector.
+ MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
+ MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
+ MinPos = DAG.getBitcast(SrcVT, MinPos);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
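The XOR-mask trick above maps SMAX/SMIN/UMAX reductions onto the unsigned-min that PHMINPOSUW computes; a small self-contained check on 8-bit scalars (matching the new v16i8 path, illustrative only):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Flip with a constant, take the unsigned minimum, flip back:
    //   SMAX uses signed-max (0x7f), SMIN uses signed-min (0x80),
    //   UMAX uses all-ones (0xff).
    static uint8_t uminViaFlip(uint8_t A, uint8_t B, uint8_t Mask) {
      return std::min<uint8_t>(A ^ Mask, B ^ Mask) ^ Mask;
    }

    int main() {
      for (int A = 0; A != 256; ++A)
        for (int B = 0; B != 256; ++B) {
          int8_t SA = (int8_t)A, SB = (int8_t)B;
          assert(uminViaFlip(A, B, 0x7f) == (uint8_t)std::max(SA, SB)); // SMAX
          assert(uminViaFlip(A, B, 0x80) == (uint8_t)std::min(SA, SB)); // SMIN
          assert(uminViaFlip(A, B, 0xff) == std::max<uint8_t>(A, B));   // UMAX
        }
      return 0;
    }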
@@ -30851,7 +30903,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
- // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+ // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
@@ -32555,7 +32607,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
// 1. MOVs can write to a register that differs from source
// 2. MOVs accept memory operands
- if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+ if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
@@ -32569,11 +32621,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
if (SarConst.isNegative())
return SDValue();
- for (MVT SVT : MVT::integer_valuetypes()) {
+ for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
unsigned ShiftSize = SVT.getSizeInBits();
// skipping types without corresponding sext/zext and
// ShlConst that is not one of [56,48,32,24,16]
- if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+ if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
continue;
SDLoc DL(N);
SDValue NN =
@@ -32626,37 +32678,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
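The SRA(SHL X, C) fold above leans on the usual shift-based sign-extension identity; a quick scalar check of the i8-in-i64 case (illustrative, and assuming an arithmetic right shift and two's-complement wraparound, which is what the DAG nodes model):

    #include <cassert>
    #include <cstdint>

    // (X << 56) >> 56 with an arithmetic right shift is a sign extension of
    // the low 8 bits, which is why the combine can use a narrower
    // sign_extend instead of the shl+sra pair.
    static int64_t viaShifts(int64_t X) {
      // Shift left on the unsigned type to avoid signed-overflow UB; the
      // right shift of a negative value is assumed to be arithmetic here.
      return (int64_t)((uint64_t)X << 56) >> 56;
    }

    static int64_t viaSext(int64_t X) { return (int64_t)(int8_t)(uint8_t)X; }

    int main() {
      for (int64_t X : {0LL, 1LL, 127LL, 128LL, 255LL, -1LL, 0x1234567890LL})
        assert(viaShifts(X) == viaSext(X));
      return 0;
    }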
-/// \brief Returns a vector of 0s if the node in input is a vector logical
-/// shift by a constant amount which is known to be bigger than or equal
-/// to the vector element size in bits.
-static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- EVT VT = N->getValueType(0);
-
- if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
- (!Subtarget.hasInt256() ||
- (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
- return SDValue();
-
- SDValue Amt = N->getOperand(1);
- SDLoc DL(N);
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
- if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
- const APInt &ShiftAmt = AmtSplat->getAPIntValue();
- unsigned MaxAmount =
- VT.getSimpleVT().getScalarSizeInBits();
-
- // SSE2/AVX2 logical shifts always return a vector of 0s
- // if the shift amount is bigger than or equal to
- // the element size. The constant shift amount will be
- // encoded as a 8-bit immediate.
- if (ShiftAmt.trunc(8).uge(MaxAmount))
- return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
- }
-
- return SDValue();
-}
-
static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -32672,11 +32693,6 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
if (SDValue V = combineShiftRightLogical(N, DAG))
return V;
- // Try to fold this logical shift into a zero vector.
- if (N->getOpcode() != ISD::SRA)
- if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
- return V;
-
return SDValue();
}
@@ -32996,21 +33012,20 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
+// Even with AVX-512, this is still useful for removing casts around logical
+// operations on vXi1 mask types.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
- if (!VT.is256BitVector())
- return SDValue();
+ assert(VT.isVector() && "Expected vector type");
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
SDValue Narrow = N->getOperand(0);
- EVT NarrowVT = Narrow->getValueType(0);
- if (!NarrowVT.is128BitVector())
- return SDValue();
+ EVT NarrowVT = Narrow.getValueType();
if (Narrow->getOpcode() != ISD::XOR &&
Narrow->getOpcode() != ISD::AND &&
@@ -33026,12 +33041,12 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
return SDValue();
// The type of the truncated inputs.
- EVT WideVT = N0->getOperand(0)->getValueType(0);
- if (WideVT != VT)
+ if (N0->getOperand(0).getValueType() != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
- bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getValueType() == VT;
ConstantSDNode *RHSConstSplat = nullptr;
if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
RHSConstSplat = RHSBV->getConstantSplatNode();
@@ -33040,37 +33055,31 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+ if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0->getOperand(0);
if (RHSConstSplat) {
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT.getVectorElementType(),
SDValue(RHSConstSplat, 0));
- N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
+ N1 = DAG.getSplatBuildVector(VT, DL, N1);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
// Generate the wide operation.
- SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+ SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
- case ISD::ZERO_EXTEND: {
- unsigned InBits = NarrowVT.getScalarSizeInBits();
- APInt Mask = APInt::getAllOnesValue(InBits);
- Mask = Mask.zext(VT.getScalarSizeInBits());
- return DAG.getNode(ISD::AND, DL, VT,
- Op, DAG.getConstant(Mask, DL, VT));
- }
+ case ISD::ZERO_EXTEND:
+ return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
- default:
- llvm_unreachable("Unexpected opcode");
}
}
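The ZERO_EXTEND case above now uses getZeroExtendInReg instead of building the AND mask by hand; the two forms are equivalent, as a small scalar sketch shows (illustrative, 8 bits kept inside a 32-bit value):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X : {0u, 0x7fu, 0x80u, 0xffu, 0x12345678u, 0xffffffffu}) {
        uint32_t ViaMask = X & ((1u << 8) - 1);        // the removed AND path
        uint32_t ViaZextInReg = (uint32_t)(uint8_t)X;  // zero-extend-in-reg
        assert(ViaMask == ViaZextInReg);
      }
      return 0;
    }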
@@ -33882,16 +33891,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- if (Subtarget.hasBWI()) {
- if (VT.getSizeInBits() > 512)
- return SDValue();
- } else if (Subtarget.hasAVX2()) {
- if (VT.getSizeInBits() > 256)
- return SDValue();
- } else {
- if (VT.getSizeInBits() > 128)
- return SDValue();
- }
// Detect the following pattern:
//
@@ -33903,7 +33902,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.
-
if (In.getOpcode() != ISD::SRL)
return SDValue();
@@ -33924,6 +33922,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return true;
};
+ // Split vectors into legal target sizes and apply AVG.
+ auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
+ unsigned NumSubs = 1;
+ if (Subtarget.hasBWI()) {
+ if (VT.getSizeInBits() > 512)
+ NumSubs = VT.getSizeInBits() / 512;
+ } else if (Subtarget.hasAVX2()) {
+ if (VT.getSizeInBits() > 256)
+ NumSubs = VT.getSizeInBits() / 256;
+ } else {
+ if (VT.getSizeInBits() > 128)
+ NumSubs = VT.getSizeInBits() / 128;
+ }
+
+ if (NumSubs == 1)
+ return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);
+
+ SmallVector<SDValue, 4> Subs;
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ VT.getVectorNumElements() / NumSubs);
+ for (unsigned i = 0; i != NumSubs; ++i) {
+ unsigned Idx = i * SubVT.getVectorNumElements();
+ SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
+ SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
+ Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+ };
+
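The splitting logic in the LowerToAVG lambda above only kicks in when the vector is wider than the widest AVG the subtarget can issue; a small sketch of the subvector count it computes (the helper name is hypothetical):

    #include <cassert>

    // Mirrors the NumSubs computation: split into MaxLegalBits-wide pieces
    // (128/256/512 depending on SSE2/AVX2/BWI) only when needed.
    static unsigned numAvgSubVectors(unsigned VectorBits, unsigned MaxLegalBits) {
      return VectorBits > MaxLegalBits ? VectorBits / MaxLegalBits : 1;
    }

    int main() {
      assert(numAvgSubVectors(512, 256) == 2);  // v64i8 AVG on an AVX2 target
      assert(numAvgSubVectors(256, 256) == 1);  // already legal, no split
      assert(numAvgSubVectors(1024, 512) == 2); // oversized even with BWI
      return 0;
    }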
// Check if each element of the vector is left-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
@@ -33947,8 +33974,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
- return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
- Operands[1]);
+ return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
}
if (Operands[0].getOpcode() == ISD::ADD)
@@ -33972,8 +33998,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return SDValue();
// The pattern is detected, emit X86ISD::AVG instruction.
- return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
- Operands[1].getOperand(0));
+ return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
}
return SDValue();
@@ -35872,14 +35897,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
- if (!DCI.isBeforeLegalizeOps()) {
- if (InVT == MVT::i1) {
- SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
- return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
- }
+ if (!DCI.isBeforeLegalizeOps())
return SDValue();
- }
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
@@ -35897,7 +35916,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
- if (Subtarget.hasAVX() && VT.is256BitVector())
+ if (VT.isVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -36089,7 +36108,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
- if (VT.is256BitVector())
+ if (VT.isVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -36244,39 +36263,54 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
- // Pre-shrink oversized index elements to avoid triggering scalarization.
- if (DCI.isBeforeLegalize()) {
+ if (DCI.isBeforeLegalizeOps()) {
SDValue Index = N->getOperand(4);
- if (Index.getScalarValueSizeInBits() > 64) {
- EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
+ // Remove any sign extends from types of 32 bits or smaller to types
+ // larger than 32 bits.
+ // Only do this before LegalizeOps in case we need the sign extend for
+ // legalization.
+ if (Index.getOpcode() == ISD::SIGN_EXTEND) {
+ if (Index.getScalarValueSizeInBits() > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original sign extend now has fewer users; add it back to the
+ // worklist in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ // Make sure the index is either i32 or i64
+ unsigned ScalarSize = Index.getScalarValueSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64) {
+ MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+ EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
+ Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Trunc;
+ NewOps[4] = Index;
DAG.UpdateNodeOperands(N, NewOps);
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
- }
- // Try to remove sign extends from i32 to i64 on the index.
- // Only do this before legalize in case we are relying on it for
- // legalization.
- // TODO: We should maybe remove any sign extend once we learn how to sign
- // extend narrow index during lowering.
- if (DCI.isBeforeLegalizeOps()) {
- SDValue Index = N->getOperand(4);
- if (Index.getScalarValueSizeInBits() == 64 &&
- Index.getOpcode() == ISD::SIGN_EXTEND &&
+ // Try to remove zero extends from 32->64 if we know the sign bit of
+ // the input is zero.
+ if (Index.getOpcode() == ISD::ZERO_EXTEND &&
+ Index.getScalarValueSizeInBits() == 64 &&
Index.getOperand(0).getScalarValueSizeInBits() == 32) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original sign extend has less users, add back to worklist in case
- // it needs to be removed.
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ if (DAG.SignBitIsZero(Index.getOperand(0))) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original zero extend now has fewer users; add it back to the
+ // worklist in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
}
}
@@ -36288,6 +36322,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[2] = Mask.getOperand(0);
DAG.UpdateNodeOperands(N, NewOps);
+ return SDValue(N, 0);
}
// With AVX2 we only demand the upper bit of the mask.
@@ -36356,7 +36391,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
- VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+ VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 2acd8d17beb2..0d30b7d47f3e 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -116,14 +116,30 @@ defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
[(int_x86_mmx_femms)], IIC_MMX_EMMS>;
+// If PREFETCHWT1 is supported, we want to use it for everything but T0.
+def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{
+ return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1();
+}]>;
+
+// Use PREFETCHWT1 for NTA, T2, T1.
+def PrefetchWT1Level : ImmLeaf<i32, [{
+ return Imm < 3;
+}]>;
+
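Taken together, the two fragments above partition the write-prefetch locality hints between PREFETCHW and PREFETCHWT1; a small sketch of the mapping they imply (the helper name is hypothetical and the mapping is inferred from the patterns, with locality 3 meaning T0):

    #include <cassert>
    #include <cstring>

    // PrefetchWLevel matches T0, and also everything when the subtarget has
    // no PREFETCHWT1; PrefetchWT1Level matches NTA/T2/T1 (locality < 3).
    static const char *writePrefetchFor(unsigned Locality, bool HasPREFETCHWT1) {
      if (Locality == 3 || !HasPREFETCHWT1)
        return "prefetchw";
      return "prefetchwt1";
    }

    int main() {
      assert(std::strcmp(writePrefetchFor(3, true), "prefetchw") == 0);
      assert(std::strcmp(writePrefetchFor(1, true), "prefetchwt1") == 0);
      assert(std::strcmp(writePrefetchFor(1, false), "prefetchw") == 0);
      return 0;
    }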
let SchedRW = [WriteLoad] in {
+let Predicates = [Has3DNow, NoSSEPrefetch] in
def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
"prefetch\t$addr",
- [(prefetch addr:$addr, (i32 0), imm, (i32 1))],
+ [(prefetch addr:$addr, imm, imm, (i32 1))],
IIC_SSE_PREFETCH>;
+
def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))],
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))],
IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>;
+
+def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
+ [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))],
+ IIC_SSE_PREFETCH>, TB, Requires<[HasPREFETCHWT1]>;
}
// "3DNowA" instructions
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 2a6ed02fadab..0b266e5591b4 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -349,8 +349,9 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{54} = hasEVEX_RC;
}
-class PseudoI<dag oops, dag iops, list<dag> pattern>
- : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+class PseudoI<dag oops, dag iops, list<dag> pattern,
+ InstrItinClass itin = NoItinerary>
+ : X86Inst<0, Pseudo, NoImm, oops, iops, "", itin> {
let Pattern = pattern;
}
@@ -423,9 +424,8 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
InstrItinClass itin = NoItinerary>
- : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+ : PseudoI<outs, ins, pattern, itin> {
let FPForm = fp;
- let Pattern = pattern;
}
// Templates for instructions that use a 16- or 32-bit segmented address as
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 42e89cb4831d..fdf3e73e4fcd 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -874,7 +874,10 @@ def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
+def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a86a0bfc168d..b48fa1841979 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3487,7 +3487,7 @@ let Predicates = [UseSSE2] in {
//===----------------------------------------------------------------------===//
// Prefetch intrinsic.
-let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
"prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
IIC_SSE_PREFETCH>, TB;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 1e04997ad294..e131f1a1e4bd 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -89,8 +89,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
- if (const char *bzeroEntry = ValC &&
- ValC->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+ if (const char *bzeroName = (ValC && ValC->isNullValue())
+ ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
+ : nullptr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
@@ -106,7 +107,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
CLI.setDebugLoc(dl)
.setChain(Chain)
.setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ DAG.getExternalSymbol(bzeroName, IntPtr),
std::move(Args))
.setDiscardResult();
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 8b08766b6171..ad023623142f 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -174,28 +174,6 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
return X86II::MO_NO_FLAG;
}
-/// This function returns the name of a function which has an interface like
-/// the non-standard bzero function, if such a function exists on the
-/// current subtarget and it is considered preferable over memset with zero
-/// passed as the second argument. Otherwise it returns null.
-const char *X86Subtarget::getBZeroEntry() const {
- // Darwin 10 has a __bzero entry point for this purpose.
- if (getTargetTriple().isMacOSX() &&
- !getTargetTriple().isMacOSXVersionLT(10, 6))
- return "__bzero";
-
- return nullptr;
-}
-
-bool X86Subtarget::hasSinCos() const {
- if (getTargetTriple().isMacOSX()) {
- return !getTargetTriple().isMacOSXVersionLT(10, 9) && is64Bit();
- } else if (getTargetTriple().isOSFuchsia()) {
- return true;
- }
- return false;
-}
-
/// Return true if the subtarget allows calls to immediate address.
bool X86Subtarget::isLegalToCallImmediateAddr() const {
// FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
@@ -324,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {
HasVNNI = false;
HasBITALG = false;
HasSHA = false;
+ HasPREFETCHWT1 = false;
HasPRFCHW = false;
HasRDSEED = false;
HasLAHFSAHF = false;
@@ -342,6 +321,7 @@ void X86Subtarget::initializeEnvironment() {
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
+ HasFastVariableShuffle = false;
HasFastPartialYMMorZMMWrite = false;
HasFastGather = false;
HasFastScalarFSQRT = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index be4d46c470de..c9435890fc1f 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -201,7 +201,7 @@ protected:
bool HasCLZERO;
/// Processor has Prefetch with intent to Write instruction
- bool HasPFPREFETCHWT1;
+ bool HasPREFETCHWT1;
/// True if SHLD instructions are slow.
bool IsSHLDSlow;
@@ -228,6 +228,10 @@ protected:
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;
+ /// True if it's preferable to combine to a single shuffle using a variable
+ /// mask over multiple fixed shuffles.
+ bool HasFastVariableShuffle;
+
/// True if there is no performance penalty to writing only the lower parts
/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite;
@@ -513,7 +517,14 @@ public:
bool hasRTM() const { return HasRTM; }
bool hasADX() const { return HasADX; }
bool hasSHA() const { return HasSHA; }
- bool hasPRFCHW() const { return HasPRFCHW; }
+ bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; }
+ bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
+ bool hasSSEPrefetch() const {
+ // We implicitly enable these when we have a write prefetch supporting a
+ // cache level, OR if we have prfchw but don't already have a read
+ // prefetch from 3dnow.
+ return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
+ }
bool hasRDSEED() const { return HasRDSEED; }
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
@@ -527,6 +538,9 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasFastVariableShuffle() const {
+ return HasFastVariableShuffle;
+ }
bool hasFastPartialYMMorZMMWrite() const {
return HasFastPartialYMMorZMMWrite;
}
@@ -682,17 +696,6 @@ public:
/// Return true if the subtarget allows calls to immediate address.
bool isLegalToCallImmediateAddr() const;
- /// This function returns the name of a function which has an interface
- /// like the non-standard bzero function, if such a function exists on
- /// the current subtarget and it is considered prefereable over
- /// memset with zero passed as the second argument. Otherwise it
- /// returns null.
- const char *getBZeroEntry() const;
-
- /// This function returns true if the target has sincos() routine in its
- /// compiler runtime or math libraries.
- bool hasSinCos() const;
-
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index ea8c9862230e..e95e6ecae091 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -281,10 +281,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
// X86 TTI query.
//===----------------------------------------------------------------------===//
-TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(X86TTIImpl(this, F));
- });
+TargetTransformInfo
+X86TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(X86TTIImpl(this, F));
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 952bd1321ff9..5b21cd82b5b1 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -45,7 +45,7 @@ public:
// attributes of each function.
const X86Subtarget *getSubtargetImpl() const = delete;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
// Set up the pass pipeline.
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 3aa7187e0cd1..38925bfd51b0 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -108,8 +108,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
}
-TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(XCoreTTIImpl(this, F));
- });
+TargetTransformInfo
+XCoreTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(XCoreTTIImpl(this, F));
}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 5baa3524d2a6..965b9b2c4d65 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -43,7 +43,7 @@ public:
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 683655f1f68b..a9cfd8ded6fb 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -710,7 +710,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
- if (F->getEntryCount())
+ if (F->hasProfileData())
return true;
// Now check if any of the entry block has MD_prof data:
for (auto *E : OI->Entries) {
@@ -863,6 +863,7 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
case Instruction::GetElementPtr:
if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
continue;
+ break;
default:
break;
}
@@ -1273,7 +1274,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
// Only try to outline cold regions if we have a profile summary, which
// implies we have profiling information.
- if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() &&
+ if (PSI->hasProfileSummary() && F->hasProfileData() &&
!DisableMultiRegionPartialInline) {
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
computeOutliningColdRegionsInfo(F);
@@ -1379,10 +1380,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
Cloner.ClonedFunc->user_end());
DenseMap<User *, uint64_t> CallSiteToProfCountMap;
- if (Cloner.OrigFunc->getEntryCount())
+ auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
+ if (CalleeEntryCount)
computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
- auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
bool AnyInline = false;
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index f0e781b9d923..7086c2eb52c4 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -1583,7 +1583,10 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
}
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
- F.setEntryCount(0);
+ // Initialize the entry count to -1, which will be treated conservatively
+ // by getEntryCount as the same as unknown (None). If we have samples, this
+ // will be overwritten in emitAnnotations.
+ F.setEntryCount(-1);
std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
if (AM) {
auto &FAM =
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index ec56f0cde25d..5fbb001216a3 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1346,6 +1346,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,
ResByArg.Bit);
applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+ break;
}
default:
break;
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index aa055121e710..a088d447337f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4394,6 +4394,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
cast<CallInst>(Caller)->getCallingConv());
cast<CallInst>(NewCaller)->setAttributes(NewPAL);
}
+ NewCaller->setDebugLoc(Caller->getDebugLoc());
return NewCaller;
}
diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 2a25423e04bd..8e2833d22032 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -80,6 +80,11 @@ static cl::opt<bool> ClInstrumentAtomics(
cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
cl::init(true));
+static cl::opt<bool> ClRecover(
+ "hwasan-recover",
+ cl::desc("Enable recovery mode (continue-after-error)."),
+ cl::Hidden, cl::init(false));
+
namespace {
/// \brief An instrumentation pass implementing detection of addressability bugs
@@ -89,7 +94,8 @@ public:
// Pass identification, replacement for typeid.
static char ID;
- HWAddressSanitizer() : FunctionPass(ID) {}
+ HWAddressSanitizer(bool Recover = false)
+ : FunctionPass(ID), Recover(Recover || ClRecover) {}
StringRef getPassName() const override { return "HWAddressSanitizer"; }
@@ -109,6 +115,8 @@ private:
LLVMContext *C;
Type *IntptrTy;
+ bool Recover;
+
Function *HwasanCtorFunction;
Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
@@ -126,8 +134,8 @@ INITIALIZE_PASS_END(
HWAddressSanitizer, "hwasan",
"HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false)
-FunctionPass *llvm::createHWAddressSanitizerPass() {
- return new HWAddressSanitizer();
+FunctionPass *llvm::createHWAddressSanitizerPass(bool Recover) {
+ return new HWAddressSanitizer(Recover);
}
/// \brief Module-level initialization.
@@ -156,10 +164,11 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string EndingStr = Recover ? "_noabort" : "";
HwasanMemoryAccessCallbackSized[AccessIsWrite] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr,
+ ClMemoryAccessCallbackPrefix + TypeStr + EndingStr,
FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false)));
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
@@ -167,7 +176,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) {
HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
ClMemoryAccessCallbackPrefix + TypeStr +
- itostr(1ULL << AccessSizeIndex),
+ itostr(1ULL << AccessSizeIndex) + EndingStr,
FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)));
}
}
@@ -246,14 +255,16 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
TerminatorInst *CheckTerm =
- SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
+ SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
MDBuilder(*C).createBranchWeights(1, 100000));
IRB.SetInsertPoint(CheckTerm);
// The signal handler will find the data address in x0.
InlineAsm *Asm = InlineAsm::get(
FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
- "hlt #" + itostr(0x100 + IsWrite * 0x10 + AccessSizeIndex), "{x0}",
+ "hlt #" +
+ itostr(0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex),
+ "{x0}",
/*hasSideEffects=*/true);
IRB.CreateCall(Asm, PtrLong);
}
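The inline-asm immediate built above packs the check parameters into a single hlt value; a small sketch of that encoding (the field layout is inferred from the expression in the code, not from a documented ABI):

    #include <cassert>

    // Base 0x100, +0x20 when recoverable, +0x10 for writes, plus
    // AccessSizeIndex (log2 of the access size in bytes).
    static unsigned hwasanHltImm(bool Recover, bool IsWrite,
                                 unsigned AccessSizeIndex) {
      return 0x100 + (Recover ? 0x20 : 0) + (IsWrite ? 0x10 : 0) +
             AccessSizeIndex;
    }

    int main() {
      assert(hwasanHltImm(false, false, 0) == 0x100); // aborting 1-byte load
      assert(hwasanHltImm(true, true, 3) == 0x133);   // recoverable 8-byte store
      return 0;
    }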
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index d8c408035038..207243231aad 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -13,10 +13,11 @@
// threading, or IPA-CP based function cloning, etc.).
// As of now we support two cases :
//
-// 1) If a call site is dominated by an OR condition and if any of its arguments
-// are predicated on this OR condition, try to split the condition with more
-// constrained arguments. For example, in the code below, we try to split the
-// call site since we can predicate the argument(ptr) based on the OR condition.
+// 1) Try to split a call-site with constrained arguments, if any constraints
+// on any argument can be found by following the single predecessors of all
+// the call-site's predecessors. Currently this pass only handles call-sites
+// with 2 predecessors. For example, in the code below, we try to split the
+// call-site since we can predicate the argument (ptr) based on the OR
+// condition.
//
// Split from :
// if (!ptr || c)
@@ -200,16 +201,15 @@ static bool canSplitCallSite(CallSite CS) {
}
/// Return true if the CS is split into its new predecessors which are directly
-/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2.
-/// In OR predicated case, PredBB1 will point the header, and PredBB2 will point
-/// to the second compare block. CallInst1 and CallInst2 will be the new
-/// call-sites placed in the new predecessors split for PredBB1 and PredBB2,
-/// repectively. Therefore, CallInst1 will be the call-site placed
-/// between Header and Tail, and CallInst2 will be the call-site between TBB and
-/// Tail. For example, in the IR below with an OR condition, the call-site can
-/// be split
+/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
+/// CallInst1 and CallInst2 will be the new call-sites placed in the new
+/// predecessors split for PredBB1 and PredBB2, respectively.
+/// For example, in the IR below with an OR condition, the call-site can
+/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
+/// call-site placed between Header and Tail, and CallInst2 will be the
+/// call-site between TBB and Tail.
///
-/// from :
+/// From :
///
/// Header:
/// %c = icmp eq i32* %a, null
@@ -237,9 +237,9 @@ static bool canSplitCallSite(CallSite CS) {
/// Tail:
/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
///
-/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
-/// created with more constrained arguments in
-/// createCallSitesOnOrPredicatedArgument().
+/// Note that in case any arguments at the call-site are constrained by its
+/// predecessors, new call-sites with more constrained arguments will be
+/// created in createCallSitesOnPredicatedArgument().
static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
Instruction *CallInst1, Instruction *CallInst2) {
Instruction *Instr = CS.getInstruction();
@@ -332,18 +332,10 @@ static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) {
splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr);
return true;
}
-// Check if one of the predecessors is a single predecessors of the other.
-// This is a requirement for control flow modeling an OR. HeaderBB points to
-// the single predecessor and OrBB points to other node. HeaderBB potentially
-// contains the first compare of the OR and OrBB the second.
-static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) {
- return OrBB->getSinglePredecessor() == HeaderBB &&
- HeaderBB->getTerminator()->getNumSuccessors() == 2;
-}
-static bool tryToSplitOnOrPredicatedArgument(CallSite CS) {
+static bool tryToSplitOnPredicatedArgument(CallSite CS) {
auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
- if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0]))
+ if (Preds[0] == Preds[1])
return false;
SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2;
@@ -362,7 +354,7 @@ static bool tryToSplitOnOrPredicatedArgument(CallSite CS) {
static bool tryToSplitCallSite(CallSite CS) {
if (!CS.arg_size() || !canSplitCallSite(CS))
return false;
- return tryToSplitOnOrPredicatedArgument(CS) ||
+ return tryToSplitOnPredicatedArgument(CS) ||
tryToSplitOnPHIPredicatedArgument(CS);
}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 6b0377e0ecb3..1476f7850cf0 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -282,7 +282,7 @@ bool JumpThreading::runOnFunction(Function &F) {
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
- bool HasProfileData = F.getEntryCount().hasValue();
+ bool HasProfileData = F.hasProfileData();
if (HasProfileData) {
LoopInfo LI{DominatorTree(F)};
BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
@@ -307,8 +307,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
- bool HasProfileData = F.getEntryCount().hasValue();
- if (HasProfileData) {
+ if (F.hasProfileData()) {
LoopInfo LI{DominatorTree(F)};
BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
@@ -1333,6 +1332,20 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// code size.
BasicBlock *UnavailablePred = nullptr;
+ // If the value is unavailable in any of the predecessors, we will end up
+ // inserting a new instruction into them. This is only valid if all the
+ // instructions before LI are guaranteed to pass execution to their
+ // successors, or if LI is safe to speculate.
+ // TODO: If this logic becomes more complex and we need to perform PRE
+ // insertion farther than to a predecessor, we should reuse the code from
+ // GVN's PRE. That requires dominator tree analysis, which is overkill for
+ // this simple case.
+ if (PredsScanned.size() != AvailablePreds.size() &&
+ !isSafeToSpeculativelyExecute(LI))
+ for (auto I = LoadBB->begin(); &*I != LI; ++I)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
+ return false;
+
// If there is exactly one predecessor where the value is unavailable, the
// already computed 'OneUnavailablePred' block is it. If it ends in an
// unconditional branch, we know that it isn't a critical edge.
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index c9d55b4594fe..430a7085d93f 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -247,7 +247,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
// Enable LoopSink only when runtime profile is available.
// With static profile, the sinking decision may be sub-optimal.
- if (!Preheader->getParent()->getEntryCount())
+ if (!Preheader->getParent()->hasProfileData())
return false;
const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 7b1d6446a24a..15e7da5e1a7a 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -882,7 +882,7 @@ static bool computeUnrollCount(
}
// Check if the runtime trip count is too small when profile is available.
- if (L->getHeader()->getParent()->getEntryCount()) {
+ if (L->getHeader()->getParent()->hasProfileData()) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
if (*ProfileTripCount < FlatLoopTripCountThreshold)
return false;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9c870b42a747..6af3fef963dc 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -476,22 +476,33 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
Alignment = DL.getABITypeAlignment(EltType);
}
- AMemSet =
- Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+ // Remember the debug location.
+ DebugLoc Loc;
+ if (!Range.TheStores.empty())
+ Loc = Range.TheStores[0]->getDebugLoc();
DEBUG(dbgs() << "Replace stores:\n";
for (Instruction *SI : Range.TheStores)
- dbgs() << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
-
- if (!Range.TheStores.empty())
- AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+ dbgs() << *SI << '\n');
// Zap all the stores.
for (Instruction *SI : Range.TheStores) {
MD->removeInstruction(SI);
SI->eraseFromParent();
}
+
+ // Create the memset after removing the stores, so that if there are any
+ // cached non-local dependencies on the removed instructions in
+ // MemoryDependenceAnalysis, the cache entries are updated to "dirty"
+ // entries pointing below the memset, so subsequent queries include the
+ // memset.
+ AMemSet =
+ Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Loc);
+
+ DEBUG(dbgs() << "With: " << *AMemSet << '\n');
+
++NumMemSetInfer;
}
@@ -1031,9 +1042,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
//
// NOTE: This is conservative, it will stop on any read from the source loc,
// not just the defining memcpy.
- MemDepResult SourceDep =
- MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
- M->getIterator(), M->getParent());
+ MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep);
+ MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false,
+ M->getIterator(), M->getParent());
+
+ if (SourceDep.isNonLocal()) {
+ SmallVector<NonLocalDepResult, 2> NonLocalDepResults;
+ MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false,
+ NonLocalDepResults);
+ if (NonLocalDepResults.size() == 1) {
+ SourceDep = NonLocalDepResults[0].getResult();
+ assert((!SourceDep.getInst() ||
+ LookupDomTree().dominates(SourceDep.getInst(), M)) &&
+ "when memdep returns exactly one result, it should dominate");
+ }
+ }
+
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
@@ -1235,6 +1259,18 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
+ if (SrcDepInfo.isNonLocal()) {
+ SmallVector<NonLocalDepResult, 2> NonLocalDepResults;
+ MD->getNonLocalPointerDependencyFrom(M, SrcLoc, /*isLoad=*/true,
+ NonLocalDepResults);
+ if (NonLocalDepResults.size() == 1) {
+ SrcDepInfo = NonLocalDepResults[0].getResult();
+ assert((!SrcDepInfo.getInst() ||
+ LookupDomTree().dominates(SrcDepInfo.getInst(), M)) &&
+ "when memdep returns exactly one result, it should dominate");
+ }
+ }
+
if (SrcDepInfo.isClobber()) {
if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
return processMemCpyMemCpyDependence(M, MDep);
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index e5866b4718da..66608ec631f6 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1929,9 +1929,32 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (!I) continue;
bool Folded = ConstantFoldTerminator(I->getParent());
- assert(Folded &&
- "Expect TermInst on constantint or blockaddress to be folded");
- (void) Folded;
+ if (!Folded) {
+ // The constant folder may not have been able to fold the terminator
+ // if this is a branch or switch on undef. Fold it manually as a
+ // branch to the first successor.
+#ifndef NDEBUG
+ if (auto *BI = dyn_cast<BranchInst>(I)) {
+ assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+ "Branch should be foldable!");
+ } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
+ assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+ } else {
+ llvm_unreachable("Didn't fold away reference to block!");
+ }
+#endif
+
+ // Make this an uncond branch to the first successor.
+ TerminatorInst *TI = I->getParent()->getTerminator();
+ BranchInst::Create(TI->getSuccessor(0), TI);
+
+ // Remove entries in successor phi nodes to remove edges.
+ for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+ TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+ // Remove the old terminator.
+ TI->eraseFromParent();
+ }
}
// Finally, delete the basic block.
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 209821ff21d7..8fa9ffb6d014 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -97,7 +97,7 @@
// load %p2
// ...
//
-// We can not do CSE for to the common part related to index "i64 %i". Lowering
+// We cannot do CSE on the common part related to index "i64 %i". Lowering
// GEPs can achieve such goals.
// If the target does not use alias analysis in codegen, this pass will
// lower a GEP with multiple indices into arithmetic operations:
diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp
index eb3139ce4293..8825f77555e7 100644
--- a/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -23,10 +23,30 @@ using namespace llvm;
/// Fix-up phi nodes in an invoke instruction's normal destination.
///
/// After versioning an invoke instruction, values coming from the original
-/// block will now either be coming from the original block or the "else" block.
+/// block will now be coming from the "merge" block. For example, in the code
+/// below:
+///
+/// then_bb:
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// merge_bb:
+/// %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ]
+/// br %normal_dst
+///
+/// normal_dst:
+/// %t3 = phi i32 [ %x, %orig_bb ], ...
+///
+/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in
+/// "normal_dst" must be fixed to refer to "merge_bb":
+///
+/// normal_dst:
+/// %t3 = phi i32 [ %x, %merge_bb ], ...
+///
static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
- BasicBlock *ElseBlock,
- Instruction *NewInst) {
+ BasicBlock *MergeBlock) {
for (auto &I : *Invoke->getNormalDest()) {
auto *Phi = dyn_cast<PHINode>(&I);
if (!Phi)
@@ -34,13 +54,7 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
int Idx = Phi->getBasicBlockIndex(OrigBlock);
if (Idx == -1)
continue;
- Value *V = Phi->getIncomingValue(Idx);
- if (dyn_cast<Instruction>(V) == Invoke) {
- Phi->setIncomingBlock(Idx, ElseBlock);
- Phi->addIncoming(NewInst, OrigBlock);
- continue;
- }
- Phi->addIncoming(V, ElseBlock);
+ Phi->setIncomingBlock(Idx, MergeBlock);
}
}
@@ -48,6 +62,23 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
///
/// After versioning an invoke instruction, values coming from the original
/// block will now be coming from either the "then" block or the "else" block.
+/// For example, in the code below:
+///
+/// then_bb:
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// unwind_dst:
+/// %t3 = phi i32 [ %x, %orig_bb ], ...
+///
+/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in
+/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb":
+///
+/// unwind_dst:
+/// %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ...
+///
static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
BasicBlock *ThenBlock,
BasicBlock *ElseBlock) {
@@ -64,44 +95,26 @@ static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
}
}
-/// Get the phi node having the returned value of a call or invoke instruction
-/// as it's operand.
-static bool getRetPhiNode(Instruction *Inst, BasicBlock *Block) {
- BasicBlock *FromBlock = Inst->getParent();
- for (auto &I : *Block) {
- PHINode *PHI = dyn_cast<PHINode>(&I);
- if (!PHI)
- break;
- int Idx = PHI->getBasicBlockIndex(FromBlock);
- if (Idx == -1)
- continue;
- auto *V = PHI->getIncomingValue(Idx);
- if (V == Inst)
- return true;
- }
- return false;
-}
-
/// Create a phi node for the returned value of a call or invoke instruction.
///
/// After versioning a call or invoke instruction that returns a value, we have
/// to merge the value of the original and new instructions. We do this by
/// creating a phi node and replacing uses of the original instruction with this
/// phi node.
-static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {
+///
+/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is
+/// defined in "then_bb", we create the following phi node:
+///
+/// ; Uses of the original instruction are replaced by uses of the phi node.
+/// %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ],
+///
+static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst,
+ BasicBlock *MergeBlock, IRBuilder<> &Builder) {
if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())
return;
- BasicBlock *RetValBB = NewInst->getParent();
- if (auto *Invoke = dyn_cast<InvokeInst>(NewInst))
- RetValBB = Invoke->getNormalDest();
- BasicBlock *PhiBB = RetValBB->getSingleSuccessor();
-
- if (getRetPhiNode(OrigInst, PhiBB))
- return;
-
- IRBuilder<> Builder(&PhiBB->front());
+ Builder.SetInsertPoint(&MergeBlock->front());
PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);
SmallVector<User *, 16> UsersToUpdate;
for (User *U : OrigInst->users())
@@ -109,7 +122,7 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {
for (User *U : UsersToUpdate)
U->replaceUsesOfWith(OrigInst, Phi);
Phi->addIncoming(OrigInst, OrigInst->getParent());
- Phi->addIncoming(NewInst, RetValBB);
+ Phi->addIncoming(NewInst, NewInst->getParent());
}
/// Cast a call or invoke instruction to the given type.
@@ -118,7 +131,41 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) {
/// that of the callee. If this is the case, we have to cast the returned value
/// to the correct type. The location of the cast depends on if we have a call
/// or invoke instruction.
-Instruction *createRetBitCast(CallSite CS, Type *RetTy) {
+///
+/// For example, if the call instruction below requires a bitcast after
+/// promotion:
+///
+/// orig_bb:
+/// %t0 = call i32 @func()
+/// ...
+///
+/// The bitcast is placed after the call instruction:
+///
+/// orig_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t0 = call i32 @func()
+/// %t1 = bitcast i32 %t0 to ...
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, a new block is created for the bitcast. For
+/// example, if the invoke instruction below requires a bitcast after promotion:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst
+///
+/// The edge between the original block and the invoke's normal destination is
+/// split, and the bitcast is placed there:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst
+///
+/// split_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t1 = bitcast i32 %t0 to ...
+/// br label %normal_dst
+///
+static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {
// Save the users of the calling instruction. These uses will be changed to
// use the bitcast after we create it.
@@ -130,19 +177,20 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {
// value. The location depends on if we have a call or invoke instruction.
Instruction *InsertBefore = nullptr;
if (auto *Invoke = dyn_cast<InvokeInst>(CS.getInstruction()))
- InsertBefore = &*Invoke->getNormalDest()->getFirstInsertionPt();
+ InsertBefore =
+ &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
else
InsertBefore = &*std::next(CS.getInstruction()->getIterator());
// Bitcast the return value to the correct type.
auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),
RetTy, "", InsertBefore);
+ if (RetBitCast)
+ *RetBitCast = Cast;
// Replace all the original uses of the calling instruction with the bitcast.
for (User *U : UsersToUpdate)
U->replaceUsesOfWith(CS.getInstruction(), Cast);
-
- return Cast;
}
/// Predicate and clone the given call site.
@@ -152,21 +200,78 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {
/// callee. The original call site is moved into the "else" block, and a clone
/// of the call site is placed in the "then" block. The cloned instruction is
/// returned.
+///
+/// For example, the call instruction below:
+///
+/// orig_bb:
+/// %t0 = call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, label %then_bb, label %else_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = call i32 %ptr()
+/// br label %merge_bb
+///
+/// else_bb:
+/// ; The original call instruction is moved to the "else" block.
+/// %t0 = call i32 %ptr()
+/// br label %merge_bb
+///
+/// merge_bb:
+/// ; Uses of the original call instruction are replaced by uses of the phi
+/// ; node.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, more work is required. For example, the
+/// invoke instruction below:
+///
+/// orig_bb:
+/// %t0 = invoke i32 %ptr() to label %normal_dst unwind label %unwind_dst
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, label %then_bb, label %else_bb
+///
+/// then_bb:
+/// ; The clone of the original invoke instruction is placed in the "then"
+/// ; block, and its normal destination is set to the "merge" block. It is
+/// ; not yet promoted.
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// ; The original invoke instruction is moved into the "else" block, and
+/// ; its normal destination is set to the "merge" block.
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// merge_bb:
+/// ; Uses of the original invoke instruction are replaced by uses of the
+/// ; phi node, and the merge block branches to the normal destination.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// br label %normal_dst
+///
static Instruction *versionCallSite(CallSite CS, Value *Callee,
- MDNode *BranchWeights,
- BasicBlock *&ThenBlock,
- BasicBlock *&ElseBlock,
- BasicBlock *&MergeBlock) {
+ MDNode *BranchWeights) {
IRBuilder<> Builder(CS.getInstruction());
Instruction *OrigInst = CS.getInstruction();
+ BasicBlock *OrigBlock = OrigInst->getParent();
// Create the compare. The called value and callee must have the same type to
// be compared.
- auto *LHS =
- Builder.CreateBitCast(CS.getCalledValue(), Builder.getInt8PtrTy());
- auto *RHS = Builder.CreateBitCast(Callee, Builder.getInt8PtrTy());
- auto *Cond = Builder.CreateICmpEQ(LHS, RHS);
+ if (CS.getCalledValue()->getType() != Callee->getType())
+ Callee = Builder.CreateBitCast(Callee, CS.getCalledValue()->getType());
+ auto *Cond = Builder.CreateICmpEQ(CS.getCalledValue(), Callee);
// Create an if-then-else structure. The original instruction is moved into
// the "else" block, and a clone of the original instruction is placed in the
@@ -175,9 +280,9 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
TerminatorInst *ElseTerm = nullptr;
SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm,
BranchWeights);
- ThenBlock = ThenTerm->getParent();
- ElseBlock = ElseTerm->getParent();
- MergeBlock = OrigInst->getParent();
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ BasicBlock *ElseBlock = ElseTerm->getParent();
+ BasicBlock *MergeBlock = OrigInst->getParent();
ThenBlock->setName("if.true.direct_targ");
ElseBlock->setName("if.false.orig_indirect");
@@ -188,7 +293,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
NewInst->insertBefore(ThenTerm);
// If the original call site is an invoke instruction, we have extra work to
- // do since invoke instructions are terminating.
+ // do since invoke instructions are terminating. We have to fix up phi nodes
+ // in the invoke's normal and unwind destinations.
if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {
auto *NewInvoke = cast<InvokeInst>(NewInst);
@@ -201,11 +307,19 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
Builder.SetInsertPoint(MergeBlock);
Builder.CreateBr(OrigInvoke->getNormalDest());
- // Now set the normal destination of new the invoke instruction to be the
+ // Fix up phi nodes in the original invoke's normal and unwind destinations.
+ fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock);
+ fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock);
+
+ // Now set the normal destinations of the invoke instructions to be the
// "merge" block.
+ OrigInvoke->setNormalDest(MergeBlock);
NewInvoke->setNormalDest(MergeBlock);
}
+ // Create a phi node for the returned value of the call site.
+ createRetPHINode(OrigInst, NewInst, MergeBlock, Builder);
+
return NewInst;
}
@@ -253,7 +367,8 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
return true;
}
-static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {
+Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
+ CastInst **RetBitCast) {
assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");
// Set the called function of the call site to be the given callee.
@@ -268,7 +383,7 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {
// If the function type of the call site matches that of the callee, no
// additional work is required.
if (CS.getFunctionType() == Callee->getFunctionType())
- return;
+ return CS.getInstruction();
// Save the return types of the call site and callee.
Type *CallSiteRetTy = CS.getInstruction()->getType();
@@ -294,7 +409,9 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) {
// If the return type of the call site doesn't match that of the callee, cast
// the returned value to the appropriate type.
if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy)
- Cast = createRetBitCast(CS, CallSiteRetTy);
+ createRetBitCast(CS, CallSiteRetTy, RetBitCast);
+
+ return CS.getInstruction();
}
Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee,
@@ -303,26 +420,10 @@ Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee,
// Version the indirect call site. If the called value is equal to the given
// callee, 'NewInst' will be executed, otherwise the original call site will
// be executed.
- BasicBlock *ThenBlock, *ElseBlock, *MergeBlock;
- Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights, ThenBlock,
- ElseBlock, MergeBlock);
+ Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights);
// Promote 'NewInst' so that it directly calls the desired function.
- Instruction *Cast = NewInst;
- promoteCall(CallSite(NewInst), Callee, Cast);
-
- // If the original call site is an invoke instruction, we have to fix-up phi
- // nodes in the invoke's normal and unwind destinations.
- if (auto *OrigInvoke = dyn_cast<InvokeInst>(CS.getInstruction())) {
- fixupPHINodeForNormalDest(OrigInvoke, MergeBlock, ElseBlock, Cast);
- fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock);
- }
-
- // Create a phi node for the returned value of the call site.
- createRetPHINode(CS.getInstruction(), Cast ? Cast : NewInst);
-
- // Return the new direct call.
- return NewInst;
+ return promoteCall(CallSite(NewInst), Callee);
}
#undef DEBUG_TYPE
diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 4273ce0b6200..c84ae7d693d7 100644
--- a/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -203,7 +203,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
// hit the peeled section.
// We only do this in the presence of profile information, since otherwise
// our estimates of the trip count are not reliable enough.
- if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) {
+ if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) {
Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
if (!PeelCount)
return;
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index f02f80cc1b78..b3c80424c8b9 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -127,6 +127,16 @@ static cl::opt<unsigned> MaxSpeculationDepth(
cl::desc("Limit maximum recursion depth when calculating costs of "
"speculatively executed instructions"));
+static cl::opt<unsigned> DependenceChainLatency(
+ "dependence-chain-latency", cl::Hidden, cl::init(8),
+ cl::desc("Limit the maximum latency of dependence chain containing cmp "
+ "for if conversion"));
+
+static cl::opt<unsigned> SmallBBSize(
+ "small-bb-size", cl::Hidden, cl::init(40),
+ cl::desc("Check dependence chain latency only in basic block smaller than "
+ "this number"));
+
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps,
"Number of switch instructions turned into linear mapping");
@@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
return true;
}
+/// Estimate the code size of the specified BB.
+static unsigned CountBBCodeSize(BasicBlock *BB,
+ const TargetTransformInfo &TTI) {
+ unsigned Size = 0;
+ for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II)
+ Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize);
+ return Size;
+}
+
+/// Find the latency of the longest dependence chain in the BB if LongestChain
+/// is true, or of the dependence chain containing the compare instruction
+/// that feeds the block's conditional branch otherwise.
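+///
+/// For example, with hypothetical per-instruction latencies from TTI:
+///
+///   %a = load i32, i32* %p      ; latency 4
+///   %b = add i32 %a, 1          ; latency 4 + 1 = 5
+///   %c = mul i32 %x, %y         ; latency 3 (operands defined outside the BB)
+///   %cmp = icmp eq i32 %b, %c   ; latency max(5, 3) + 1 = 6
+///
+/// the longest chain latency is 6, which here is also the latency of the
+/// chain feeding the compare.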
+static unsigned FindDependenceChainLatency(BasicBlock *BB,
+ DenseMap<Instruction *, unsigned> &Instructions,
+ const TargetTransformInfo &TTI,
+ bool LongestChain) {
+ unsigned MaxLatency = 0;
+
+ BasicBlock::iterator II;
+ for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) {
+ unsigned Latency = 0;
+ for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) {
+ Instruction *Op = dyn_cast<Instruction>(II->getOperand(O));
+ if (Op && Instructions.count(Op)) {
+ auto OpLatency = Instructions[Op];
+ if (OpLatency > Latency)
+ Latency = OpLatency;
+ }
+ }
+ Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency);
+ Instructions[&(*II)] = Latency;
+
+ if (Latency > MaxLatency)
+ MaxLatency = Latency;
+ }
+
+ if (LongestChain)
+ return MaxLatency;
+
+ // The length of the dependence chain containing the compare instruction is
+ // wanted, so the terminator must be a BranchInst.
+ assert(isa<BranchInst>(II));
+ BranchInst* Br = cast<BranchInst>(II);
+ Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition());
+ if (Cmp && Instructions.count(Cmp))
+ return Instructions[Cmp];
+ else
+ return 0;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, and instructions
+/// in BB1 may have users in BB2. If the last (in terms of latency) such
+/// instruction in BB1 is I, then the instructions issued after I can execute
+/// in parallel with the instructions in BB2.
+/// This function returns the latency of I.
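+///
+/// For example, with hypothetical accumulated latencies: if BB2 uses %x from
+/// BB1 whose accumulated latency is 3 and %y from BB1 whose accumulated
+/// latency is 7, this returns 7; only the BB1 instructions issued after %y
+/// can overlap with BB2.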
+static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2,
+ BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+ DenseMap<Instruction *, unsigned> &BB1Instructions) {
+ unsigned LastLatency = 0;
+ SmallVector<Instruction *, 16> Worklist;
+ BasicBlock::iterator II;
+ for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+ if (PHINode *PN = dyn_cast<PHINode>(II)) {
+ // Look for users in BB2.
+ bool InBBUser = false;
+ for (User *U : PN->users()) {
+ if (cast<Instruction>(U)->getParent() == BB2) {
+ InBBUser = true;
+ break;
+ }
+ }
+ // No such user, we don't care about this instruction and its operands.
+ if (!InBBUser)
+ break;
+ }
+ Worklist.push_back(&(*II));
+ }
+
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+ if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) {
+ if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2)
+ Worklist.push_back(Op);
+ else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) {
+ if (BB1Instructions[Op] > LastLatency)
+ LastLatency = BB1Instructions[Op];
+ }
+ }
+ }
+ }
+
+ return LastLatency;
+}
+
+/// If, after if-conversion, most of the instructions in the new BB form a
+/// long and slow dependence chain, the result may be slower than cmp/branch,
+/// even if the branch has a high miss rate, because the control dependence is
+/// transformed into a data dependence. A control dependence can be
+/// speculated, so with a branch the second part can execute in parallel with
+/// the first part on a modern OOO processor; a data dependence cannot.
+///
+/// To check this condition, this function finds the length of the dependence
+/// chain in BB1 containing the cmp (only the part that can execute in
+/// parallel with the code after the branch in BB2), and if that length
+/// exceeds a threshold, if-conversion is not performed.
+///
+/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion.
+/// SpeculationSize contains the code size of IfBlock1 and IfBlock2.
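+///
+/// As a schematic illustration (names are made up), if-conversion rewrites
+///
+///   %cmp = icmp slt i32 %chain_result, %n
+///   br i1 %cmp, label %then_bb, label %merge_bb
+///
+/// into a select:
+///
+///   %cmp = icmp slt i32 %chain_result, %n
+///   %v = select i1 %cmp, i32 %a, i32 %b
+///
+/// so every user of %v must now wait for %chain_result instead of executing
+/// speculatively past the branch.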
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+ BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+ unsigned SpeculationSize,
+ const TargetTransformInfo &TTI) {
+ // Accumulated latency of each instruction in their BBs.
+ DenseMap<Instruction *, unsigned> BB1Instructions;
+ DenseMap<Instruction *, unsigned> BB2Instructions;
+
+ if (!TTI.isOutOfOrder())
+ return false;
+
+ unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI)
+ + SpeculationSize;
+
+ // We only check small BBs, since it is more difficult to find unrelated
+ // instructions to fill the functional units in a small BB.
+ if (NewBBSize > SmallBBSize)
+ return false;
+
+ auto BB1Chain =
+ FindDependenceChainLatency(BB1, BB1Instructions, TTI, false);
+ auto BB2Chain =
+ FindDependenceChainLatency(BB2, BB2Instructions, TTI, true);
+
+ // If there are many unrelated instructions in the new BB, there will be
+ // other instructions for the processor to issue regardless of the length
+ // of this new dependence chain.
+ // Modern processors can issue 3 or more instructions in each cycle. But in
+ // real-world applications, an IPC of 2 is already very good for non-loop
+ // code with small basic blocks. Higher IPC is usually found in programs
+ // with small kernels. So an IPC of 2 is more reasonable for most
+ // applications.
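+ // For example, with hypothetical numbers: if NewBBSize is 30 and the chain
+ // latencies are 6 and 5, then (6 + 5) * 2 = 22 <= 30, so there is enough
+ // unrelated work to keep the pipeline busy and we do not block the
+ // if-conversion here.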
+ if ((BB1Chain + BB2Chain) * 2 <= NewBBSize)
+ return false;
+
+ // We only care about part of the dependence chain in BB1 that can be
+ // executed in parallel with BB2, so adjust the latency.
+ BB1Chain -=
+ LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions);
+
+ // A correctly predicted branch can skip the dependence chain in BB1, but a
+ // misprediction has a penalty, so a branch is better than a select only
+ // when the dependence chain is longer than DependenceChainLatency. Besides
+ // the misprediction penalty, the threshold DependenceChainLatency also
+ // depends on the branch misprediction rate, the taken-branch latency, and
+ // the cmov latency.
+ if (BB1Chain >= DependenceChainLatency)
+ return true;
+
+ return false;
+}
+
/// Extract ConstantInt from value, looking through IntToPtr
/// and PointerNullValue. Return NULL if value is not a constant int.
static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -1654,14 +1824,11 @@ namespace {
} // end anonymous namespace
-/// Given an unconditional branch that goes to BBEnd,
-/// check whether BBEnd has only two predecessors and the other predecessor
-/// ends with an unconditional branch. If it is true, sink any common code
-/// in the two predecessors to BBEnd.
-static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
- assert(BI1->isUnconditional());
- BasicBlock *BBEnd = BI1->getSuccessor(0);
-
+/// Check whether BB's predecessors end with unconditional branches. If so,
+/// sink any common code from the predecessors to BB.
+/// We also allow at most one predecessor to end with a conditional branch.
+static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {
// We support two situations:
// (1) all incoming arcs are unconditional
// (2) one incoming arc is conditional
@@ -1705,7 +1872,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
//
SmallVector<BasicBlock*,4> UnconditionalPreds;
Instruction *Cond = nullptr;
- for (auto *B : predecessors(BBEnd)) {
+ for (auto *B : predecessors(BB)) {
auto *T = B->getTerminator();
if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
UnconditionalPreds.push_back(B);
@@ -1773,8 +1940,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
// Insert a new block postdominating all blocks we're going to sink from.
- if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds,
- ".sink.split"))
+ if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split"))
// Edges couldn't be split.
return false;
Changed = true;
@@ -2048,6 +2214,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
return false;
+ // Don't do if conversion for long dependence chain.
+ if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+ CountBBCodeSize(ThenBB, TTI), TTI))
+ return false;
+
// If we get here, we can hoist the instruction and if-convert.
DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
@@ -2355,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
}
}
+ if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+ AggressiveInsts.size(), TTI))
+ return false;
+
DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
<< IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
@@ -5728,9 +5903,6 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
BasicBlock *BB = BI->getParent();
BasicBlock *Succ = BI->getSuccessor(0);
- if (SinkCommon && Options.SinkCommonInsts && SinkThenElseCodeToEnd(BI))
- return true;
-
// If the Terminator is the only non-phi instruction, simplify the block.
// If LoopHeader is provided, check if the block or its successor is a loop
// header. (This is for early invocations before loop simplify and
@@ -6008,6 +6180,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
if (MergeBlockIntoPredecessor(BB))
return true;
+ if (SinkCommon && Options.SinkCommonInsts)
+ Changed |= SinkCommonCodeFromPredecessors(BB);
+
IRBuilder<> Builder(BB);
// If there is a trivial two-entry PHI node in this basic block, and we can
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index fbcdc0df0f1c..52f32cda2609 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5049,13 +5049,13 @@ bool LoopVectorizationLegality::canVectorize() {
bool Result = true;
bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- if (DoExtraAnalysis)
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
if (!TheLoop->getLoopPreheader()) {
+ DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- if (DoExtraAnalysis)
+ if (DoExtraAnalysis)
Result = false;
else
return false;
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 76ba62f5d596..a7ccd3faec44 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -646,23 +646,17 @@ private:
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
- int OpdNum = 0);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
- /// Vectorize a single entry in the tree.\p OpdNum indicate the ordinality of
- /// operand corrsponding to this tree entry \p E for the user tree entry
- /// indicated by \p UserIndx.
- // In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
- Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
- /// Vectorize a single entry in the tree, starting in \p VL.\p OpdNum indicate
- /// the ordinality of operand corrsponding to the \p VL of scalar values for the
- /// user indicated by \p UserIndx this \p VL feeds into.
- Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
@@ -708,16 +702,6 @@ private:
return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
- /// \returns true if the scalars in VL are found in this tree entry.
- bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
- ScalarEvolution &SE) const {
- assert(VL.size() == Scalars.size() && "Invalid size");
- SmallVector<Value *, 8> List;
- if (!sortLoadAccesses(VL, DL, SE, List))
- return false;
- return std::equal(List.begin(), List.end(), Scalars.begin());
- }
-
/// A vector of scalars.
ValueList Scalars;
@@ -727,14 +711,6 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather = false;
- /// Records optional shuffle mask for the uses of jumbled memory accesses.
- /// For example, a non-empty ShuffleMask[1] represents the permutation of
- /// lanes that operand #1 of this vectorized instruction should undergo
- /// before feeding this vectorized instruction, whereas an empty
- /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized
- /// instruction need not be permuted at all.
- SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;
-
/// Points back to the VectorizableTree.
///
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -750,31 +726,12 @@ private:
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
- int &UserTreeIdx, const InstructionsState &S,
- ArrayRef<unsigned> ShuffleMask = None,
- int OpdNum = 0) {
- assert((!Vectorized || S.Opcode != 0) &&
- "Vectorized TreeEntry without opcode");
+ int &UserTreeIdx) {
VectorizableTree.emplace_back(VectorizableTree);
-
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
-
- TreeEntry *UserTreeEntry = nullptr;
- if (UserTreeIdx != -1)
- UserTreeEntry = &VectorizableTree[UserTreeIdx];
-
- if (UserTreeEntry && !ShuffleMask.empty()) {
- if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
- UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
- assert(UserTreeEntry->ShuffleMask[OpdNum].empty() &&
- "Mask already present");
- using mask = SmallVector<unsigned, 4>;
- mask tempMask(ShuffleMask.begin(), ShuffleMask.end());
- UserTreeEntry->ShuffleMask[OpdNum] = tempMask;
- }
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1427,34 +1384,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- int UserTreeIdx, int OpdNum) {
+ int UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1466,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1477,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1496,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (getTreeEntry(I)) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1506,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1520,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1529,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned j = i + 1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1544,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1563,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1578,7 +1535,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1590,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
} else {
BS.cancelScheduling(VL, VL0);
}
- newTreeEntry(VL, Reuse, UserTreeIdx, S);
+ newTreeEntry(VL, Reuse, UserTreeIdx);
return;
}
case Instruction::Load: {
@@ -1605,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1616,13 +1573,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
}
// Check if the loads are consecutive, reversed, or neither.
+ // TODO: What we really want is to sort the loads, but for now, check
+ // the two likely directions.
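+ // For example, loads of a[0], a[1], a[2], a[3] are consecutive, while loads
+ // of a[3], a[2], a[1], a[0] are reverse-consecutive.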
bool Consecutive = true;
bool ReverseConsecutive = true;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1636,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1650,41 +1609,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
break;
}
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
+
if (ReverseConsecutive) {
- DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
++NumLoadsWantToChangeOrder;
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
- return;
- }
-
- if (VL.size() > 2) {
- bool ShuffledLoads = true;
- SmallVector<Value *, 8> Sorted;
- SmallVector<unsigned, 4> Mask;
- if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
- auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
- for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
- if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
- ShuffledLoads = false;
- break;
- }
- }
- // TODO: Tracking how many load wants to have arbitrary shuffled order
- // would be usefull.
- if (ShuffledLoads) {
- DEBUG(dbgs() << "SLP: added a vector of loads which needs "
- "permutation of loaded lanes.\n");
- newTreeEntry(NewVL, true, UserTreeIdx, S,
- makeArrayRef(Mask.begin(), Mask.end()), OpdNum);
- return;
- }
- }
+ DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
+ } else {
+ DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
}
-
- DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
return;
}
case Instruction::ZExt:
@@ -1704,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1718,7 +1651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1732,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1747,7 +1680,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1770,7 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@@ -1779,7 +1712,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ValueList Left, Right;
reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1789,7 +1722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
@@ -1799,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1812,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1824,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1837,7 +1770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1846,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
@@ -1869,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1883,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1894,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1907,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1922,7 +1855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1931,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// then do not vectorize this instruction.
if (!S.IsAltShuffle) {
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true, UserTreeIdx, S);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
@@ -1943,7 +1876,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ValueList Left, Right;
reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
buildTree_rec(Left, Depth + 1, UserTreeIdx);
- buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1953,13 +1886,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
default:
BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, false, UserTreeIdx, S);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -2797,20 +2730,12 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
return nullptr;
}
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
if (S.Opcode) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- TreeEntry *UserTreeEntry = nullptr;
- if (UserIndx != -1)
- UserTreeEntry = &VectorizableTree[UserIndx];
-
- if (E->isSame(VL) ||
- (UserTreeEntry &&
- (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
- !UserTreeEntry->ShuffleMask[OpdNum].empty() &&
- E->isFoundJumbled(VL, *DL, *SE)))
- return vectorizeTree(E, OpdNum, UserIndx);
+ if (E->isSame(VL))
+ return vectorizeTree(E);
}
}
@@ -2822,10 +2747,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
return Gather(VL, VecTy);
}
-Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
- TreeEntry *UserTreeEntry = nullptr;
if (E->VectorizedValue) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
@@ -2845,10 +2769,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
return V;
}
- assert(ScalarToTreeEntry.count(E->Scalars[0]) &&
- "Expected user tree entry, missing!");
- int CurrIndx = ScalarToTreeEntry[E->Scalars[0]];
-
unsigned ShuffleOrOp = S.IsAltShuffle ?
(unsigned) Instruction::ShuffleVector : S.Opcode;
switch (ShuffleOrOp) {
@@ -2878,7 +2798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(Operands, i, CurrIndx);
+ Value *Vec = vectorizeTree(Operands);
NewPhi->addIncoming(Vec, IBB);
}
@@ -2931,7 +2851,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
setInsertPointAfterBundle(E->Scalars, VL0);
- Value *InVec = vectorizeTree(INVL, 0, CurrIndx);
+ Value *InVec = vectorizeTree(INVL);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -2952,8 +2872,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
setInsertPointAfterBundle(E->Scalars, VL0);
- Value *L = vectorizeTree(LHSV, 0, CurrIndx);
- Value *R = vectorizeTree(RHSV, 1, CurrIndx);
+ Value *L = vectorizeTree(LHSV);
+ Value *R = vectorizeTree(RHSV);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -2980,9 +2900,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
setInsertPointAfterBundle(E->Scalars, VL0);
- Value *Cond = vectorizeTree(CondVec, 0, CurrIndx);
- Value *True = vectorizeTree(TrueVec, 1, CurrIndx);
- Value *False = vectorizeTree(FalseVec, 2, CurrIndx);
+ Value *Cond = vectorizeTree(CondVec);
+ Value *True = vectorizeTree(TrueVec);
+ Value *False = vectorizeTree(FalseVec);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -3023,8 +2943,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
- Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -3045,20 +2965,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
// sink them all the way down past store instructions.
setInsertPointAfterBundle(E->Scalars, VL0);
- if (UserIndx != -1)
- UserTreeEntry = &VectorizableTree[UserIndx];
-
- bool isJumbled = false;
- LoadInst *LI = NULL;
- if (UserTreeEntry &&
- (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
- !UserTreeEntry->ShuffleMask[OpdNum].empty()) {
- isJumbled = true;
- LI = cast<LoadInst>(E->Scalars[0]);
- } else {
- LI = cast<LoadInst>(VL0);
- }
-
+ LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
@@ -3080,21 +2987,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
- propagateMetadata(LI, E->Scalars);
-
- if (isJumbled) {
- SmallVector<Constant *, 8> Mask;
- for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum])
- Mask.push_back(Builder.getInt32(LaneEntry));
- // Generate shuffle for jumbled memory access
- Value *Undef = UndefValue::get(VecTy);
- Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
- ConstantVector::get(Mask));
- E->VectorizedValue = Shuf;
- ++NumVectorInstructions;
- return Shuf;
- }
- return LI;
+ return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
@@ -3107,7 +3000,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
setInsertPointAfterBundle(E->Scalars, VL0);
- Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx);
+ Value *VecValue = vectorizeTree(ScalarStoreValues);
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
@@ -3133,7 +3026,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
for (Value *V : E->Scalars)
Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
- Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);
+ Value *Op0 = vectorizeTree(Op0VL);
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
@@ -3142,7 +3035,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
for (Value *V : E->Scalars)
OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
- Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
+ Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
}
@@ -3181,7 +3074,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
OpVL.push_back(CEI->getArgOperand(j));
}
- Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
+ Value *OpVec = vectorizeTree(OpVL);
DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
@@ -3212,8 +3105,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
setInsertPointAfterBundle(E->Scalars, VL0);
- Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
- Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
@@ -3313,14 +3206,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
- assert((!E->NeedToGather) && "Extracting from a gather list");
+ assert(!E->NeedToGather && "Extracting from a gather list");
- Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue);
- if (Vec && dyn_cast<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) {
- Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0);
- } else {
- Vec = E->VectorizedValue;
- }
+ Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
@@ -4017,6 +3905,7 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
Roots.push_back(I->getOperand(0));
+ break;
case Instruction::ZExt:
case Instruction::SExt:
break;
diff --git a/test/Analysis/BasicAA/args-rets-allocas-loads.ll b/test/Analysis/BasicAA/args-rets-allocas-loads.ll
index 05b56a07e44b..b31fb26f1c9b 100644
--- a/test/Analysis/BasicAA/args-rets-allocas-loads.ll
+++ b/test/Analysis/BasicAA/args-rets-allocas-loads.ll
@@ -308,4 +308,9 @@ define void @caller_a(double* %arg_a0,
; CHECK-NEXT: 0 mod responses (0.0%)
; CHECK-NEXT: 0 ref responses (0.0%)
; CHECK-NEXT: 140 mod & ref responses (76.0%)
-; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76%
+; CHECK-NEXT: 0 must responses (0.0%)
+; CHECK-NEXT: 0 must mod responses (0.0%)
+; CHECK-NEXT: 0 must ref responses (0.0%)
+; CHECK-NEXT: 0 must mod & ref responses (0.0%)
+; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76%/0%/0%/0%/0%
+
diff --git a/test/Analysis/BasicAA/call-attrs.ll b/test/Analysis/BasicAA/call-attrs.ll
index 9cd17e486799..8538e8b4771d 100644
--- a/test/Analysis/BasicAA/call-attrs.ll
+++ b/test/Analysis/BasicAA/call-attrs.ll
@@ -31,12 +31,12 @@ entry:
ret void
}
-; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_attr(i8* %p)
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @readonly_attr(i8* %p)
; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_func(i8* %p)
-; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p)
+; CHECK: Just Mod (MustAlias): Ptr: i8* %p <-> call void @writeonly_attr(i8* %p)
; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_func(i8* %p)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_attr(i8* %p)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_func(i8* %p)
; CHECK: Both ModRef: Ptr: i8* %p <-> call void @read_write(i8* %p, i8* %p, i8* %p)
-; CHECK: Just Ref: Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ]
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ]
; CHECK: Both ModRef: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) [ "deopt"(i8* %p) ]
diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll
index 1580af9ea826..e4367bb6d61b 100644
--- a/test/Analysis/BasicAA/cs-cs-arm.ll
+++ b/test/Analysis/BasicAA/cs-cs-arm.ll
@@ -19,11 +19,11 @@ entry:
; CHECK-LABEL: Function: test1:
; CHECK: NoAlias: i8* %p, i8* %q
-; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll
index 3695275649b2..30b7e0720f0b 100644
--- a/test/Analysis/BasicAA/cs-cs.ll
+++ b/test/Analysis/BasicAA/cs-cs.ll
@@ -164,7 +164,7 @@ define void @test4(i8* %P, i8* noalias %Q) #3 {
; CHECK-LABEL: Function: test4:
; CHECK: NoAlias: i8* %P, i8* %Q
-; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
+; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
@@ -192,6 +192,26 @@ define void @test5(i8* %P, i8* %Q, i8* %R) #3 {
; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
}
+define void @test5a(i8* noalias %P, i8* noalias %Q, i8* noalias %R) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test5a:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: NoAlias: i8* %P, i8* %R
+; CHECK: NoAlias: i8* %Q, i8* %R
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
define void @test6(i8* %P) #3 {
call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
call void @a_readonly_func(i8* %P)
@@ -199,7 +219,7 @@ define void @test6(i8* %P) #3 {
; CHECK-LABEL: Function: test6:
-; CHECK: Just Mod: Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
+; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
; CHECK: Just Ref: Ptr: i8* %P <-> call void @a_readonly_func(i8* %P)
; CHECK: Just Mod: call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) <-> call void @a_readonly_func(i8* %P)
; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
@@ -237,9 +257,9 @@ entry:
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func()
; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func()
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q)
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessiblememonly_func()
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_argmemonly_func(i8* %q)
@@ -254,12 +274,34 @@ entry:
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p)
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q)
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func()
-; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q)
; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p)
; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q)
; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func()
-; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
+; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q)
+}
+
+;; test that MustAlias is not set for calls when the pointer does not MustAlias every argument the call accesses.
+declare void @another_argmemonly_func(i8*, i8*) #0
+define void @test8a(i8* noalias %p, i8* noalias %q) {
+entry:
+ call void @another_argmemonly_func(i8* %p, i8* %q)
+ ret void
+
+; CHECK-LABEL: Function: test8a
+; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q)
}
+define void @test8b(i8* %p, i8* %q) {
+entry:
+ call void @another_argmemonly_func(i8* %p, i8* %q)
+ ret void
+
+; CHECK-LABEL: Function: test8b
+; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q)
+}
+
;; test that an unknown operand bundle has an unknown effect on the heap
define void @test9(i8* %p) {
@@ -310,9 +352,9 @@ entry:
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
@@ -321,10 +363,10 @@ entry:
; CHECK: NoModRef: call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ]
; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
-; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ]
; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ]
; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ]
-; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
+; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ]
}
attributes #0 = { argmemonly nounwind }
diff --git a/test/Analysis/MemorySSA/volatile-clobber.ll b/test/Analysis/MemorySSA/volatile-clobber.ll
index d6f960f3e382..53df7de499bd 100644
--- a/test/Analysis/MemorySSA/volatile-clobber.ll
+++ b/test/Analysis/MemorySSA/volatile-clobber.ll
@@ -22,8 +22,7 @@ define i32 @foo() {
ret i32 %4
}
-; Ensuring that we don't automatically hoist nonvolatile loads around volatile
-; loads
+; Ensuring we allow hoisting nonvolatile loads around volatile loads.
; CHECK-LABEL: define void @volatile_only
define void @volatile_only(i32* %arg1, i32* %arg2) {
; Trivially NoAlias/MustAlias
@@ -36,7 +35,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) {
; CHECK: MemoryUse(liveOnEntry)
; CHECK-NEXT: load i32, i32* %b
load i32, i32* %b
-; CHECK: MemoryUse(1)
+; CHECK: MemoryUse(liveOnEntry)
; CHECK-NEXT: load i32, i32* %a
load i32, i32* %a
@@ -44,7 +43,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) {
; CHECK: 2 = MemoryDef(1)
; CHECK-NEXT: load volatile i32, i32* %arg1
load volatile i32, i32* %arg1
-; CHECK: MemoryUse(2)
+; CHECK: MemoryUse(liveOnEntry)
; CHECK-NEXT: load i32, i32* %arg2
load i32, i32* %arg2
@@ -75,10 +74,10 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) {
; CHECK: MemoryUse(1)
; CHECK-NEXT: load atomic i32, i32* %b unordered, align 4
load atomic i32, i32* %b unordered, align 4
-; CHECK: MemoryUse(2)
+; CHECK: MemoryUse(1)
; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4
load atomic i32, i32* %a unordered, align 4
-; CHECK: MemoryUse(2)
+; CHECK: MemoryUse(1)
; CHECK-NEXT: load i32, i32* %a
load i32, i32* %a
@@ -86,7 +85,7 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) {
; CHECK: 3 = MemoryDef(2)
; CHECK-NEXT: load atomic volatile i32, i32* %arg1 monotonic, align 4
load atomic volatile i32, i32* %arg1 monotonic, align 4
-; CHECK: MemoryUse(3)
+; CHECK: MemoryUse(1)
; CHECK-NEXT: load i32, i32* %arg2
load i32, i32* %arg2
diff --git a/test/Analysis/ValueTracking/memory-dereferenceable.ll b/test/Analysis/ValueTracking/memory-dereferenceable.ll
index ca16a266123c..2e9453f670ce 100644
--- a/test/Analysis/ValueTracking/memory-dereferenceable.ll
+++ b/test/Analysis/ValueTracking/memory-dereferenceable.ll
@@ -20,7 +20,8 @@ declare i32* @foo()
@globalptr.align16 = external global i8, align 16
; CHECK-LABEL: 'test'
-define void @test(i32 addrspace(1)* dereferenceable(8) %dparam,
+define void @test(%struct.A* sret %result,
+ i32 addrspace(1)* dereferenceable(8) %dparam,
i8 addrspace(1)* dereferenceable(32) align 1 %dparam.align1,
i8 addrspace(1)* dereferenceable(32) align 16 %dparam.align16,
i8* byval %i8_byval,
@@ -41,17 +42,14 @@ entry:
%empty_alloca = alloca i8, i64 0
%empty_load = load i8, i8* %empty_alloca
- ; Load from too small array alloca
-; CHECK-NOT: %small_array_alloca
- %small_array_alloca = alloca i8, i64 2
- %saa_cast = bitcast i8* %small_array_alloca to i32*
- %saa_load = load i32, i32* %saa_cast
-
- ; Load from array alloca
-; CHECK: %big_array_alloca{{.*}}(unaligned)
- %big_array_alloca = alloca i8, i64 4
- %baa_cast = bitcast i8* %big_array_alloca to i32*
- %baa_load = load i32, i32* %baa_cast
+ ; Loads from sret arguments
+; CHECK: %sret_gep{{.*}}(aligned)
+ %sret_gep = getelementptr inbounds %struct.A, %struct.A* %result, i64 0, i32 1, i64 2
+ load i8, i8* %sret_gep
+
+; CHECK-NOT: %sret_gep_outside
+ %sret_gep_outside = getelementptr %struct.A, %struct.A* %result, i64 0, i32 1, i64 7
+ load i8, i8* %sret_gep_outside
; CHECK: %dparam{{.*}}(aligned)
%load3 = load i32, i32 addrspace(1)* %dparam
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 86ac5507a407..111aaf88b160 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -135,7 +135,7 @@ continue:
}
; Check that we fallback on invoke translation failures.
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT quad 2
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT fp128 0xL00000000000000004000000000000000
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump
; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump:
define fp128 @test_quad_dump() {
diff --git a/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir
new file mode 100644
index 000000000000..47fda8f998d7
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir
@@ -0,0 +1,44 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel -global-isel-abort=0 %s -o - | FileCheck %s
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64"
+
+ define fp128 @x(fp128 %a) {
+ entry:
+ %a.addr = alloca fp128, align 16
+ store fp128 %a, fp128* %a.addr, align 16
+ %0 = load fp128, fp128* %a.addr, align 16
+ %sub = fsub fp128 0xL00000000000000008000000000000000, %0
+ ret fp128 %sub
+ }
+
+...
+---
+name: x
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+fixedStack:
+stack:
+ - { id: 0, name: a.addr, type: default, offset: 0, size: 16, alignment: 16,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+body: |
+ bb.1.entry:
+ liveins: %q0
+
+ ; This test just checks we don't crash on G_FNEG of FP128 types. Expect to fall
+ ; back until support is added for fp128.
+ ; CHECK: ret
+ %0:_(s128) = COPY %q0
+ %1:_(p0) = G_FRAME_INDEX %stack.0.a.addr
+ G_STORE %0(s128), %1(p0) :: (store 16 into %ir.a.addr)
+ %2:_(s128) = G_LOAD %1(p0) :: (load 16 from %ir.a.addr)
+ %3:_(s128) = G_FNEG %2
+ %q0 = COPY %3(s128)
+ RET_ReallyLR implicit %q0
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
index 865315bbe0a3..4b69575079a3 100644
--- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
+++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
@@ -1,85 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
%type = type [4 x {i8, i32}]
define %type* @first_offset_const(%type* %addr) {
-; CHECK-LABEL: name: first_offset_const
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[RES:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: first_offset_const
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+ ; CHECK: %x0 = COPY [[GEP]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i32 1
ret %type* %res
}
define %type* @first_offset_trivial(%type* %addr) {
-; CHECK-LABEL: name: first_offset_trivial
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[TRIVIAL:%[0-9]+]]:_(p0) = COPY [[BASE]](p0)
-; CHECK: %x0 = COPY [[TRIVIAL]](p0)
+ ; CHECK-LABEL: name: first_offset_trivial
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0)
+ ; CHECK: %x0 = COPY [[COPY1]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i32 0
ret %type* %res
}
define %type* @first_offset_variable(%type* %addr, i64 %idx) {
-; CHECK-LABEL: name: first_offset_variable
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: first_offset_variable
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0, %x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]]
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0)
+ ; CHECK: %x0 = COPY [[COPY2]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i64 %idx
ret %type* %res
}
define %type* @first_offset_ext(%type* %addr, i32 %idx) {
-; CHECK-LABEL: name: first_offset_ext
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX32:%[0-9]+]]:_(s32) = COPY %w1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[IDX64:%[0-9]+]]:_(s64) = G_SEXT [[IDX32]](s32)
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX64]]
-; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: first_offset_ext
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %w1, %x0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %w1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[SEXT]]
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0)
+ ; CHECK: %x0 = COPY [[COPY2]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type, %type* %addr, i32 %idx
ret %type* %res
}
%type1 = type [4 x [4 x i32]]
define i32* @const_then_var(%type1* %addr, i64 %idx) {
-; CHECK-LABEL: name: const_then_var
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 272
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[BASE2]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+ ; CHECK-LABEL: name: const_then_var
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0, %x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 272
+ ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[COPY1]]
+ ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[MUL]](s64)
+ ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP1]](p0)
+ ; CHECK: %x0 = COPY [[COPY2]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type1, %type1* %addr, i32 4, i32 1, i64 %idx
ret i32* %res
}
define i32* @var_then_const(%type1* %addr, i64 %idx) {
-; CHECK-LABEL: name: var_then_const
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
-; CHECK: %x0 = COPY [[BASE2]](p0)
+ ; CHECK-LABEL: name: var_then_const
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: %x0, %x1
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]]
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+ ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[C1]](s64)
+ ; CHECK: %x0 = COPY [[GEP1]](p0)
+ ; CHECK: RET_ReallyLR implicit %x0
%res = getelementptr %type1, %type1* %addr, i64 %idx, i32 2, i32 2
ret i32* %res
}
diff --git a/test/CodeGen/AArch64/arm64-jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll
index f5c2ee6da0bf..fac3e5704d15 100644
--- a/test/CodeGen/AArch64/arm64-jumptable.ll
+++ b/test/CodeGen/AArch64/arm64-jumptable.ll
@@ -6,22 +6,20 @@ define void @sum(i32 %a, i32* %to, i32 %c) {
entry:
switch i32 %a, label %exit [
i32 1, label %bb1
- i32 2, label %bb2
+ i32 2, label %exit.sink.split
i32 3, label %bb3
i32 4, label %bb4
]
bb1:
%b = add i32 %c, 1
- store i32 %b, i32* %to
- br label %exit
-bb2:
- store i32 2, i32* %to
- br label %exit
+ br label %exit.sink.split
bb3:
- store i32 3, i32* %to
- br label %exit
+ br label %exit.sink.split
bb4:
- store i32 5, i32* %to
+ br label %exit.sink.split
+exit.sink.split:
+ %.sink = phi i32 [ 5, %bb4 ], [ %b, %bb1 ], [ 3, %bb3 ], [ %a, %entry ]
+ store i32 %.sink, i32* %to
br label %exit
exit:
ret void
diff --git a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
index 29036caabf3a..3466e1bace56 100644
--- a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
+++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -4,9 +4,10 @@
; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s
; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
-; CHECK: @fct1
+; CHECK-LABEL: fct1:
; For small size (<= 256), we do not change memset to bzero.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct1(i8* nocapture %ptr) {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false)
@@ -15,20 +16,20 @@ entry:
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-; CHECK: @fct2
+; CHECK-LABEL: fct2:
; When the size is bigger than 256, change into bzero.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct2(i8* nocapture %ptr) {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
ret void
}
-; CHECK: @fct3
+; CHECK-LABEL: fct3:
; For unknown size, change to bzero.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct3(i8* nocapture %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
@@ -36,9 +37,10 @@ entry:
ret void
}
-; CHECK: @fct4
+; CHECK-LABEL: fct4:
; Size <= 256, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct4(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -50,10 +52,10 @@ declare i8* @__memset_chk(i8*, i32, i64, i64)
declare i64 @llvm.objectsize.i64(i8*, i1)
-; CHECK: @fct5
+; CHECK-LABEL: fct5:
; Size > 256, change.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct5(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -61,10 +63,10 @@ entry:
ret void
}
-; CHECK: @fct6
+; CHECK-LABEL: fct6:
; Size = unknown, change.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
define void @fct6(i8* %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
@@ -76,9 +78,10 @@ entry:
; Next functions check that memset is not turned into bzero
; when the set constant is non-zero, whatever the given size.
-; CHECK: @fct7
+; CHECK-LABEL: fct7:
; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct7(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -86,9 +89,10 @@ entry:
ret void
}
-; CHECK: @fct8
+; CHECK-LABEL: fct8:
; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct8(i8* %ptr) {
entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -96,9 +100,10 @@ entry:
ret void
}
-; CHECK: @fct9
+; CHECK-LABEL: fct9:
; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
define void @fct9(i8* %ptr, i32 %unknown) {
entry:
%conv = sext i32 %unknown to i64
diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll
index d22bfc76d1d5..b3a2bcd5d669 100644
--- a/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,GENERIC
; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
@@ -47,7 +47,6 @@ declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmla_lane_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %b
@@ -58,7 +57,6 @@ entry:
define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlaq_lane_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %b
@@ -69,7 +67,6 @@ entry:
define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmla_lane_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %b
@@ -80,7 +77,6 @@ entry:
define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlaq_lane_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %b
@@ -91,7 +87,6 @@ entry:
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmla_laneq_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -102,7 +97,6 @@ entry:
define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %b
@@ -113,7 +107,6 @@ entry:
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmla_laneq_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -124,7 +117,6 @@ entry:
define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %b
@@ -135,7 +127,6 @@ entry:
define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmls_lane_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %b
@@ -146,7 +137,6 @@ entry:
define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsq_lane_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %b
@@ -157,7 +147,6 @@ entry:
define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmls_lane_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %b
@@ -168,7 +157,6 @@ entry:
define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsq_lane_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %b
@@ -179,7 +167,6 @@ entry:
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmls_laneq_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -190,7 +177,6 @@ entry:
define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %b
@@ -201,7 +187,6 @@ entry:
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmls_laneq_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -212,7 +197,6 @@ entry:
define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %b
@@ -223,7 +207,6 @@ entry:
define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %a
@@ -233,7 +216,6 @@ entry:
define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %a
@@ -243,7 +225,6 @@ entry:
define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %a
@@ -253,7 +234,6 @@ entry:
define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %a
@@ -263,7 +243,6 @@ entry:
define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i16> %shuffle, %a
@@ -273,7 +252,6 @@ entry:
define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%mul = mul <8 x i16> %shuffle, %a
@@ -283,7 +261,6 @@ entry:
define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%mul = mul <2 x i32> %shuffle, %a
@@ -293,7 +270,6 @@ entry:
define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = mul <4 x i32> %shuffle, %a
@@ -303,7 +279,6 @@ entry:
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -313,7 +288,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %a
@@ -323,7 +297,6 @@ entry:
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -333,7 +306,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %a
@@ -343,7 +315,6 @@ entry:
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -353,7 +324,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%mul = mul <8 x i16> %shuffle, %a
@@ -363,7 +333,6 @@ entry:
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -373,7 +342,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = mul <4 x i32> %shuffle, %a
@@ -382,12 +350,9 @@ entry:
define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfma_lane_f32:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -398,12 +363,9 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmaq_lane_f32:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -414,12 +376,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfma_laneq_f32:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -428,12 +387,9 @@ entry:
define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f32:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -442,12 +398,9 @@ entry:
define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfms_lane_f32:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -457,12 +410,9 @@ entry:
define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmsq_lane_f32:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -472,12 +422,9 @@ entry:
define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfms_laneq_f32:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -487,12 +434,9 @@ entry:
define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f32:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -502,12 +446,9 @@ entry:
define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmaq_lane_f64:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -518,12 +459,9 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f64:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -532,12 +470,9 @@ entry:
define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmsq_lane_f64:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%sub = fsub <1 x double> <double -0.000000e+00>, %v
%lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -547,12 +482,9 @@ entry:
define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f64:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -563,10 +495,6 @@ entry:
define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmas_laneq_f32
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXNOS-LABEL: test_vfmas_laneq_f32
-; EXNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; EXNOS-NEXT: ret
entry:
%extract = extractelement <4 x float> %v, i32 3
%0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -578,7 +506,6 @@ declare float @llvm.fma.f32(float, float, float)
define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmsd_lane_f64
; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
entry:
%extract.rhs = extractelement <1 x double> %v, i32 0
%extract = fsub double -0.000000e+00, %extract.rhs
@@ -591,10 +518,6 @@ declare double @llvm.fma.f64(double, double, double)
define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmss_lane_f32
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmss_lane_f32
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; EXYNOS-NEXT: ret
entry:
%extract.rhs = extractelement <2 x float> %v, i32 1
%extract = fsub float -0.000000e+00, %extract.rhs
@@ -605,7 +528,6 @@ entry:
define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmss_laneq_f32
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%extract.rhs = extractelement <4 x float> %v, i32 3
%extract = fsub float -0.000000e+00, %extract.rhs
@@ -616,10 +538,6 @@ entry:
define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsd_laneq_f64
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsd_laneq_f64
-; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; EXYNOS-NEXT: ret
entry:
%extract.rhs = extractelement <2 x double> %v, i32 1
%extract = fsub double -0.000000e+00, %extract.rhs
@@ -641,10 +559,6 @@ entry:
define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmss_lane_f32_0
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmss_lane_f32_0
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; EXYNOS-NEXT: ret
entry:
%tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -655,7 +569,6 @@ entry:
define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmss_laneq_f32_0
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%tmp1 = extractelement <4 x float> %tmp0, i32 3
@@ -666,7 +579,6 @@ entry:
define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsd_laneq_f64_0
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
entry:
%tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
%tmp1 = extractelement <2 x double> %tmp0, i32 1
@@ -677,7 +589,6 @@ entry:
define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -688,7 +599,6 @@ entry:
define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -699,7 +609,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -710,7 +619,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -721,7 +629,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -733,7 +640,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -745,7 +651,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -757,7 +662,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -769,7 +673,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -780,7 +683,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -791,7 +693,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -802,7 +703,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -813,7 +713,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -825,7 +724,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -837,7 +735,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -849,7 +746,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -861,7 +757,6 @@ entry:
define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -872,7 +767,6 @@ entry:
define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -883,7 +777,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -894,7 +787,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -905,7 +797,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -917,7 +808,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -929,7 +819,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -941,7 +830,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -953,7 +841,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -964,7 +851,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -975,7 +861,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -986,7 +871,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -997,7 +881,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1009,7 +892,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1021,7 +903,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1033,7 +914,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1045,7 +925,6 @@ entry:
define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1055,7 +934,6 @@ entry:
define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1065,7 +943,6 @@ entry:
define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1075,7 +952,6 @@ entry:
define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1085,7 +961,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1096,7 +971,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1107,7 +981,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1118,7 +991,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1129,7 +1001,6 @@ entry:
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1139,7 +1010,6 @@ entry:
define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1149,7 +1019,6 @@ entry:
define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1159,7 +1028,6 @@ entry:
define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1169,7 +1037,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1180,7 +1047,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1191,7 +1057,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1202,7 +1067,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1213,7 +1077,6 @@ entry:
define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s16:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1224,7 +1087,6 @@ entry:
define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s32:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1235,7 +1097,6 @@ entry:
define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s16:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1247,7 +1108,6 @@ entry:
define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s32:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1259,7 +1119,6 @@ entry:
define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s16:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1270,7 +1129,6 @@ entry:
define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s32:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1281,7 +1139,6 @@ entry:
define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1293,7 +1150,6 @@ entry:
define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1305,7 +1161,6 @@ entry:
define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_lane_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1315,7 +1170,6 @@ entry:
define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_lane_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1325,7 +1179,6 @@ entry:
define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1335,7 +1188,6 @@ entry:
define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1345,7 +1197,6 @@ entry:
define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1356,7 +1207,6 @@ entry:
define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1367,7 +1217,6 @@ entry:
define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1378,7 +1227,6 @@ entry:
define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1389,7 +1237,6 @@ entry:
define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1399,7 +1246,6 @@ entry:
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1409,7 +1255,6 @@ entry:
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1419,7 +1264,6 @@ entry:
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1429,7 +1273,6 @@ entry:
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1439,7 +1282,6 @@ entry:
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1449,7 +1291,6 @@ entry:
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1459,7 +1300,6 @@ entry:
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1468,12 +1308,9 @@ entry:
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x float> %shuffle, %a
@@ -1483,10 +1320,6 @@ entry:
define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmul_lane_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1498,12 +1331,9 @@ entry:
define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulq_lane_f32:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul = fmul <4 x float> %shuffle, %a
@@ -1512,12 +1342,9 @@ entry:
define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulq_lane_f64:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -1526,12 +1353,9 @@ entry:
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmul_laneq_f32:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%mul = fmul <2 x float> %shuffle, %a
@@ -1541,10 +1365,6 @@ entry:
define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -1556,12 +1376,9 @@ entry:
define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulq_laneq_f32:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul = fmul <4 x float> %shuffle, %a
@@ -1570,12 +1387,9 @@ entry:
define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulq_laneq_f64:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%mul = fmul <2 x double> %shuffle, %a
@@ -1584,12 +1398,9 @@ entry:
define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulx_lane_f32:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1598,12 +1409,9 @@ entry:
define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulxq_lane_f32:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; Exynos-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1612,12 +1420,9 @@ entry:
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulxq_lane_f64:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1626,12 +1431,9 @@ entry:
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulx_laneq_f32:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1640,12 +1442,9 @@ entry:
define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f32:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f32:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1654,12 +1453,9 @@ entry:
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f64:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1669,7 +1465,6 @@ entry:
define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmla_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1680,7 +1475,6 @@ entry:
define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlaq_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1691,7 +1485,6 @@ entry:
define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmla_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1702,7 +1495,6 @@ entry:
define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlaq_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1713,7 +1505,6 @@ entry:
define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmla_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1724,7 +1515,6 @@ entry:
define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1735,7 +1525,6 @@ entry:
define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmla_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1746,7 +1535,6 @@ entry:
define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlaq_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1757,7 +1545,6 @@ entry:
define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmls_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1768,7 +1555,6 @@ entry:
define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsq_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1779,7 +1565,6 @@ entry:
define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmls_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1790,7 +1575,6 @@ entry:
define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsq_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1801,7 +1585,6 @@ entry:
define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmls_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %b
@@ -1812,7 +1595,6 @@ entry:
define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %b
@@ -1823,7 +1605,6 @@ entry:
define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmls_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %b
@@ -1834,7 +1615,6 @@ entry:
define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsq_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %b
@@ -1845,7 +1625,6 @@ entry:
define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1855,7 +1634,6 @@ entry:
define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1865,7 +1643,6 @@ entry:
define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1875,7 +1652,6 @@ entry:
define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1885,7 +1661,6 @@ entry:
define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmul_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1895,7 +1670,6 @@ entry:
define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmulq_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1905,7 +1679,6 @@ entry:
define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmul_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1915,7 +1688,6 @@ entry:
define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmulq_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1925,7 +1697,6 @@ entry:
define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1935,7 +1706,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1945,7 +1715,6 @@ entry:
define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1955,7 +1724,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -1965,7 +1733,6 @@ entry:
define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmul_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i16> %shuffle, %a
@@ -1975,7 +1742,6 @@ entry:
define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmulq_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
%mul = mul <8 x i16> %shuffle, %a
@@ -1985,7 +1751,6 @@ entry:
define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmul_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%mul = mul <2 x i32> %shuffle, %a
@@ -1995,7 +1760,6 @@ entry:
define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmulq_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
%mul = mul <4 x i32> %shuffle, %a
@@ -2004,12 +1768,9 @@ entry:
define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfma_lane_f32_0:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2018,12 +1779,9 @@ entry:
define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmaq_lane_f32_0:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -2032,12 +1790,9 @@ entry:
define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfma_laneq_f32_0:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2046,12 +1801,9 @@ entry:
define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f32_0:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -2060,12 +1812,9 @@ entry:
define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfms_lane_f32_0:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -2075,12 +1824,9 @@ entry:
define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
; CHECK-LABEL: test_vfmsq_lane_f32_0:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -2090,12 +1836,9 @@ entry:
define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfms_laneq_f32_0:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -2105,12 +1848,9 @@ entry:
define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f32_0:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
%lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -2120,12 +1860,9 @@ entry:
define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmaq_laneq_f64_0:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -2134,12 +1871,9 @@ entry:
define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
; CHECK-LABEL: test_vfmsq_laneq_f64_0:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
%lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2150,7 +1884,6 @@ entry:
define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2161,7 +1894,6 @@ entry:
define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2172,7 +1904,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2183,7 +1914,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2194,7 +1924,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2206,7 +1935,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2218,7 +1946,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2230,7 +1957,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2242,7 +1968,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2253,7 +1978,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2264,7 +1988,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2275,7 +1998,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2286,7 +2008,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2298,7 +2019,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2310,7 +2030,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2322,7 +2041,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2334,7 +2052,6 @@ entry:
define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_lane_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2345,7 +2062,6 @@ entry:
define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_lane_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2356,7 +2072,6 @@ entry:
define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_laneq_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2367,7 +2082,6 @@ entry:
define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_laneq_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2378,7 +2092,6 @@ entry:
define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2390,7 +2103,6 @@ entry:
define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_lane_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2402,7 +2114,6 @@ entry:
define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2414,7 +2125,6 @@ entry:
define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2426,7 +2136,6 @@ entry:
define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_lane_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2437,7 +2146,6 @@ entry:
define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_lane_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2448,7 +2156,6 @@ entry:
define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2459,7 +2166,6 @@ entry:
define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_laneq_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2470,7 +2176,6 @@ entry:
define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2482,7 +2187,6 @@ entry:
define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2494,7 +2198,6 @@ entry:
define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2506,7 +2209,6 @@ entry:
define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2518,7 +2220,6 @@ entry:
define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2528,7 +2229,6 @@ entry:
define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2538,7 +2238,6 @@ entry:
define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_lane_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2548,7 +2247,6 @@ entry:
define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_lane_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2558,7 +2256,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2569,7 +2266,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2580,7 +2276,6 @@ entry:
define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vmull_high_lane_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2591,7 +2286,6 @@ entry:
define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vmull_high_lane_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2602,7 +2296,6 @@ entry:
define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2612,7 +2305,6 @@ entry:
define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2622,7 +2314,6 @@ entry:
define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_laneq_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2632,7 +2323,6 @@ entry:
define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_laneq_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2642,7 +2332,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2653,7 +2342,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2664,7 +2352,6 @@ entry:
define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2675,7 +2362,6 @@ entry:
define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vmull_high_laneq_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2686,7 +2372,6 @@ entry:
define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s16_0:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2697,7 +2382,6 @@ entry:
define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_lane_s32_0:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2708,7 +2392,6 @@ entry:
define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2720,7 +2403,6 @@ entry:
define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2732,7 +2414,6 @@ entry:
define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2743,7 +2424,6 @@ entry:
define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2754,7 +2434,6 @@ entry:
define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2766,7 +2445,6 @@ entry:
define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2778,7 +2456,6 @@ entry:
define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_lane_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2788,7 +2465,6 @@ entry:
define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_lane_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2798,7 +2474,6 @@ entry:
define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
%vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2808,7 +2483,6 @@ entry:
define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_laneq_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
%vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2818,7 +2492,6 @@ entry:
define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2829,7 +2502,6 @@ entry:
define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2840,7 +2512,6 @@ entry:
define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2851,7 +2522,6 @@ entry:
define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2862,7 +2532,6 @@ entry:
define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2872,7 +2541,6 @@ entry:
define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2882,7 +2550,6 @@ entry:
define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2892,7 +2559,6 @@ entry:
define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2902,7 +2568,6 @@ entry:
define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
%vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2912,7 +2577,6 @@ entry:
define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
%vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -2922,7 +2586,6 @@ entry:
define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
%vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2932,7 +2595,6 @@ entry:
define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -2941,12 +2603,9 @@ entry:
define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32_0:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2955,12 +2614,9 @@ entry:
define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulq_lane_f32_0:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -2969,12 +2625,9 @@ entry:
define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmul_laneq_f32_0:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x float> %shuffle, %a
@@ -2984,10 +2637,6 @@ entry:
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f64_0:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
-; EXYNOS-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
%1 = bitcast <8 x i8> %0 to double
@@ -2999,12 +2648,9 @@ entry:
define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulq_laneq_f32_0:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%mul = fmul <4 x float> %shuffle, %a
@@ -3013,12 +2659,9 @@ entry:
define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulq_laneq_f64_0:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%mul = fmul <2 x double> %shuffle, %a
@@ -3027,12 +2670,9 @@ entry:
define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulx_lane_f32_0:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -3041,12 +2681,9 @@ entry:
define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmulxq_lane_f32_0:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -3055,12 +2692,9 @@ entry:
define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulxq_lane_f64_0:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -3069,12 +2703,9 @@ entry:
define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulx_laneq_f32_0:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -3083,12 +2714,9 @@ entry:
define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f32_0:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f32_0:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
%vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -3097,12 +2725,9 @@ entry:
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64_0:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f64_0:
-; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
%vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -3111,14 +2736,11 @@ entry:
define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
; CHECK-LABEL: optimize_dup:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: optimize_dup:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
entry:
%lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
@@ -3130,15 +2752,12 @@ entry:
define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
; CHECK-LABEL: no_optimize_dup:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: no_optimize_dup:
-; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
+; EXYNOSM1: dup [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s
entry:
%lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
@@ -3150,8 +2769,7 @@ entry:
define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -3160,9 +2778,8 @@ entry:
define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {
; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
-; CHECK: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; CHECK-NEXT: ret
+; GENERIC: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
entry:
%lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 453334dce601..2fb9d3b2d030 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -87,4 +87,13 @@ for.end:
ret double %v0
}
+define <2 x i64> @t6() {
+; ALL-LABEL: t6:
+; CYCLONE: movi.16b v0, #0
+; KRYO: movi v0.2d, #0000000000000000
+; FALKOR: movi v0.2d, #0000000000000000
+ ret <2 x i64> zeroinitializer
+}
+
+
declare double @sin(double)
diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll
new file mode 100644
index 000000000000..1c2e5528f10c
--- /dev/null
+++ b/test/CodeGen/AArch64/chkstk.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \
+; RUN: | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
+
+; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \
+; RUN: | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
+
+define void @check_watermark() {
+entry:
+ %buffer = alloca [4096 x i8], align 1
+ ret void
+}
+
+; CHECK-DEFAULT-CODE-MODEL: check_watermark:
+; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp
+; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-DEFAULT-CODE-MODEL: bl __chkstk
+; CHECK-DEFAULT-CODE-MODEL: sub sp, sp, x15, lsl #4
+
+; CHECK-LARGE-CODE-MODEL: check_watermark:
+; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp
+; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk
+; CHECK-LARGE-CODE-MODEL: blr x16
+; CHECK-LARGE-CODE-MODEL: sub sp, sp, x15, lsl #4
diff --git a/test/CodeGen/AArch64/ldst-paired-aliasing.ll b/test/CodeGen/AArch64/ldst-paired-aliasing.ll
index 9c698b5fdcc6..9b0b51d369a3 100644
--- a/test/CodeGen/AArch64/ldst-paired-aliasing.ll
+++ b/test/CodeGen/AArch64/ldst-paired-aliasing.ll
@@ -10,11 +10,10 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
-; CHECK: stp xzr, xzr, [sp, #72]
+; CHECK: str xzr, [sp, #80]
; CHECK: str w9, [sp, #80]
-; CHECK: str q0, [sp, #48]
+; CHECK: stp q0, q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
-; CHECK: str q0, [sp, #64]
for.body.lr.ph.i.i.i.i.i.i63:
%b1 = alloca [10 x i32], align 16
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 228d3c7d4306..71c4c83c28f9 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -251,7 +251,8 @@ entry:
; R600: MOVA_INT
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
new file mode 100644
index 000000000000..f97785beab6f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn--amdhsa-amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Effectively, check that compilation finishes; in the infinite-loop
+; case, llc toggles forever between merging two ST4s
+; (MergeConsecutiveStores()) and breaking the resulting ST8
+; apart again (LegalizeStoreOps()).
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+
+; GCN-LABEL: {{^}}_Z6brokenPd:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}
+define amdgpu_kernel void @_Z6brokenPd(double* %arg) {
+bb:
+ %tmp = alloca double, align 8, addrspace(5)
+ %tmp1 = alloca double, align 8, addrspace(5)
+ %tmp2 = load double, double* %arg, align 8
+ br i1 1, label %bb6, label %bb4
+
+bb3: ; No predecessors!
+ br label %bb4
+
+bb4: ; preds = %bb3, %bb
+ %tmp5 = phi double addrspace(5)* [ %tmp1, %bb3 ], [ %tmp, %bb ]
+ store double %tmp2, double addrspace(5)* %tmp5, align 8
+ br label %bb6
+
+bb6: ; preds = %bb4, %bb
+ %tmp7 = phi double [ 0x7FF8123000000000, %bb4 ], [ 0x7FF8000000000000, %bb ]
+ store double %tmp7, double* %arg, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 7c2666e3680f..8b9b83f6d0e7 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -6,6 +6,7 @@
define void @test_trunc_and_zext_s16() { ret void }
define void @test_trunc_and_anyext_s8() { ret void }
define void @test_trunc_and_anyext_s16() { ret void }
+ define void @test_trunc_s64() #0 { ret void }
define void @test_add_s32() { ret void }
define void @test_add_fold_imm_s32() { ret void }
@@ -46,6 +47,10 @@
define void @test_gep() { ret void }
define void @test_constant_imm() { ret void }
define void @test_constant_cimm() { ret void }
+ define void @test_pointer_constant() { ret void }
+
+ define void @test_inttoptr_s32() { ret void }
+ define void @test_ptrtoint_s32() { ret void }
define void @test_select_s32() { ret void }
define void @test_select_ptr() { ret void }
@@ -241,6 +246,36 @@ body: |
; CHECK: BX_RET 14, %noreg, implicit %r0
...
---
+name: test_trunc_s64
+# CHECK-LABEL: name: test_trunc_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: fprb }
+ - { id: 1, class: gprb }
+ - { id: 2, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0, %d0
+
+ %0(s64) = COPY %d0
+ ; CHECK: [[VREG:%[0-9]+]]:dpr = COPY %d0
+
+ %2(p0) = COPY %r0
+ ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0
+
+ %1(s32) = G_TRUNC %0(s64)
+ ; CHECK: [[VREGTRUNC:%[0-9]+]]:gpr, [[UNINTERESTING:%[0-9]+]]:gpr = VMOVRRD [[VREG]]
+
+ G_STORE %1(s32), %2 :: (store 4)
+ ; CHECK: STRi12 [[VREGTRUNC]], [[PTR]], 0, 14, %noreg
+
+ BX_RET 14, %noreg
+ ; CHECK: BX_RET 14, %noreg
+...
+---
name: test_add_s32
# CHECK-LABEL: name: test_add_s32
legalized: true
@@ -1075,6 +1110,71 @@ body: |
BX_RET 14, %noreg, implicit %r0
...
---
+name: test_pointer_constant
+# CHECK-LABEL: name: test_pointer_constant
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+body: |
+ bb.0:
+ %0(p0) = G_CONSTANT i32 0
+ ; CHECK: %[[C:[0-9]+]]:gpr = MOVi 0, 14, %noreg, %noreg
+
+ %r0 = COPY %0(p0)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s32) = COPY %r0
+ %1(p0) = G_INTTOPTR %0(s32)
+ ; CHECK: [[INT:%[0-9]+]]:gpr = COPY %r0
+ ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY [[INT]]
+
+ %r0 = COPY %1(p0)
+ ; CHECK: %r0 = COPY [[PTR]]
+
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(p0) = COPY %r0
+ %1(s32) = G_PTRTOINT %0(p0)
+ ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0
+ ; CHECK: [[INT:%[0-9]+]]:gpr = COPY [[PTR]]
+
+ %r0 = COPY %1(s32)
+ ; CHECK: %r0 = COPY [[INT]]
+
+ BX_RET 14, %noreg, implicit %r0
+...
+---
name: test_select_s32
# CHECK-LABEL: name: test_select_s32
legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index e3e206cf76e9..204434e981b4 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -3,6 +3,9 @@
define void @test_sext_s8() { ret void }
define void @test_zext_s16() { ret void }
+ define void @test_inttoptr_s32() { ret void }
+ define void @test_ptrtoint_s32() { ret void }
+
define void @test_add_s8() { ret void }
define void @test_add_s16() { ret void }
define void @test_add_s32() { ret void }
@@ -101,6 +104,50 @@ body: |
BX_RET 14, %noreg, implicit %r0
...
---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s32) = COPY %r0
+ %1(p0) = G_INTTOPTR %0(s32)
+ ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output
+ ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}}
+ %r0 = COPY %1(p0)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(p0) = COPY %r0
+ %1(s32) = G_PTRTOINT %0(p0)
+ ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output
+ ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}}
+ %r0 = COPY %1(s32)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
name: test_add_s8
# CHECK-LABEL: name: test_add_s8
legalized: false
@@ -826,6 +873,7 @@ registers:
- { id: 2, class: _ }
- { id: 3, class: _ }
- { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0
@@ -856,6 +904,10 @@ body: |
; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32)
; CHECK-NOT: G_CONSTANT i1
+ %5(p0) = G_CONSTANT 0
+ G_STORE %5(p0), %4(p0) :: (store 4)
+ ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0
+
%r0 = COPY %0(s32)
BX_RET 14, %noreg, implicit %r0
...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index 044740e33a2d..175333626f97 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -24,6 +24,9 @@
define void @test_constants() { ret void }
+ define void @test_inttoptr_s32() { ret void }
+ define void @test_ptrtoint_s32() { ret void }
+
@a_global = global float 1.0
define void @test_globals() { ret void }
@@ -31,6 +34,7 @@
define void @test_anyext_s16_32() { ret void }
define void @test_trunc_s32_16() { ret void }
+ define void @test_trunc_s64_32() #0 { ret void }
define void @test_icmp_eq_s32() { ret void }
define void @test_fcmp_one_s32() #0 { ret void }
@@ -496,6 +500,44 @@ body: |
BX_RET 14, %noreg, implicit %r0
...
---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ %0(s32) = COPY %r0
+ %1(p0) = G_INTTOPTR %0(s32)
+ %r0 = COPY %1(p0)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ %0(p0) = COPY %r0
+ %1(s32) = G_PTRTOINT %0(p0)
+ %r0 = COPY %1(s32)
+ BX_RET 14, %noreg, implicit %r0
+...
+---
name: test_globals
# CHECK-LABEL: name: test_globals
legalized: true
@@ -584,6 +626,30 @@ body: |
BX_RET 14, %noreg
...
---
+name: test_trunc_s64_32
+# CHECK-LABEL: name: test_trunc_s64_32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %d0
+
+ %0(s64) = COPY %d0
+ %2(p0) = COPY %r0
+ %1(s32) = G_TRUNC %0(s64)
+ G_STORE %1(s32), %2 :: (store 4)
+ BX_RET 14, %noreg
+...
+---
name: test_icmp_eq_s32
# CHECK-LABEL: name: test_icmp_eq_s32
legalized: true
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 78d3ebf371a4..9373c5d44210 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
; Avoid 16-bit 's' instructions that partially update CPSR (and add a false
; dependency) when they aren't dependent on the last CPSR-defining instruction.
; rdar://8928208
diff --git a/test/CodeGen/ARM/su-addsub-overflow.ll b/test/CodeGen/ARM/su-addsub-overflow.ll
new file mode 100644
index 000000000000..eef531282033
--- /dev/null
+++ b/test/CodeGen/ARM/su-addsub-overflow.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=generic | FileCheck %s
+
+define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: sadd:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: add r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R1]], r[[R0]]
+; CHECK-NEXT: movvc pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: uadd:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: adds r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R1]], r[[R0]]
+; CHECK-NEXT: movhs pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: ssub:
+; CHECK: cmp r0, r1
+; CHECK-NEXT: subvc r0, r0, r1
+; CHECK-NEXT: movvc pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: usub:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: subs r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R0]], r1
+; CHECK-NEXT: movhs pc, lr
+entry:
+ %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ br i1 %1, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %2 = extractvalue { i32, i1 } %0, 0
+ ret i32 %2
+
+}
+
+define void @sum(i32* %a, i32* %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: sum:
+; CHECK: ldr [[R0:r[0-9]+]],
+; CHECK-NEXT: ldr [[R1:r[0-9]+|lr]],
+; CHECK-NEXT: add [[R2:r[0-9]+]], [[R1]], [[R0]]
+; CHECK-NEXT: cmp [[R2]], [[R1]]
+; CHECK-NEXT: strvc [[R2]],
+; CHECK-NEXT: addvc
+; CHECK-NEXT: cmpvc
+; CHECK-NEXT: bvs
+entry:
+ %cmp7 = icmp eq i32 %n, 0
+ br i1 %cmp7, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.08 = phi i32 [ %7, %cont2 ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.08
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.08
+ %1 = load i32, i32* %arrayidx1, align 4
+ %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %1, i32 %0)
+ %3 = extractvalue { i32, i1 } %2, 1
+ br i1 %3, label %trap, label %cont
+
+trap:
+ tail call void @llvm.trap() #2
+ unreachable
+
+cont:
+ %4 = extractvalue { i32, i1 } %2, 0
+ store i32 %4, i32* %arrayidx1, align 4
+ %5 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.08, i32 1)
+ %6 = extractvalue { i32, i1 } %5, 1
+ br i1 %6, label %trap, label %cont2
+
+cont2:
+ %7 = extractvalue { i32, i1 } %5, 0
+ %cmp = icmp eq i32 %7, %n
+ br i1 %cmp, label %for.cond.cleanup, label %for.body
+
+}
+
+declare void @llvm.trap() #2
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
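For orientation, a minimal C sketch of source that a Clang-style frontend would typically lower to the llvm.sadd.with.overflow / llvm.trap shape checked by the sadd test above; the function below is an illustration only (its name and the use of __builtin_sadd_overflow are assumptions, not part of the commit):

#include <stdint.h>

/* Hypothetical source for the sadd test: the checked-add builtin is
   typically lowered to llvm.sadd.with.overflow.i32 and the trap
   builtin to llvm.trap, matching the branch-to-trap IR above. */
int32_t sadd(int32_t a, int32_t b) {
  int32_t sum;
  if (__builtin_sadd_overflow(a, b, &sum)) /* overflow bit of the intrinsic */
    __builtin_trap();                      /* the unreachable trap block */
  return sum;                              /* extractvalue ..., 0 */
}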
diff --git a/test/CodeGen/ARM/usat.ll b/test/CodeGen/ARM/usat.ll
new file mode 100644
index 000000000000..8f19d11ef7bb
--- /dev/null
+++ b/test/CodeGen/ARM/usat.ll
@@ -0,0 +1,214 @@
+; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T
+; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6
+; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2
+
+; Check for several conditions that should result in USAT.
+; For example, the base test is equivalent to
+; x < 0 ? 0 : (x > k ? k : x) in C. All patterns that bound x
+; to the interval [0, k] where k + 1 is a power of 2 can be
+; transformed into USAT. At the end there are some tests
+; checking that conditionals are not transformed if they don't
+; match the right pattern.
+
+;
+; Base tests with different bit widths
+;
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 32-bit base test
+define i32 @unsigned_sat_base_32bit(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_32bit:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i32 %x, 0
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp
+ ret i32 %saturateLow
+}
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 16-bit base test
+define i16 @unsigned_sat_base_16bit(i16 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_16bit:
+; V6: usat r0, #11, r0
+; V6T2: usat r0, #11, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i16 %x, 0
+ %cmpUp = icmp sgt i16 %x, 2047
+ %saturateUp = select i1 %cmpUp, i16 2047, i16 %x
+ %saturateLow = select i1 %cmpLow, i16 0, i16 %saturateUp
+ ret i16 %saturateLow
+}
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 8-bit base test
+define i8 @unsigned_sat_base_8bit(i8 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_8bit:
+; V6: usat r0, #5, r0
+; V6T2: usat r0, #5, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i8 %x, 0
+ %cmpUp = icmp sgt i8 %x, 31
+ %saturateUp = select i1 %cmpUp, i8 31, i8 %x
+ %saturateLow = select i1 %cmpLow, i8 0, i8 %saturateUp
+ ret i8 %saturateLow
+}
+
+;
+; Tests where the conditionals that check for upper and lower bounds,
+; or the < and > operators, are arranged in different ways. Only some
+; of the possible combinations that lead to USAT are tested.
+;
+; x < 0 ? 0 : (x < k ? x : k)
+define i32 @unsigned_sat_lower_upper_1(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_lower_upper_1:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp slt i32 %x, 0
+ %cmpUp = icmp slt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp
+ ret i32 %saturateLow
+}
+
+; x > 0 ? (x > k ? k : x) : 0
+define i32 @unsigned_sat_lower_upper_2(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_lower_upper_2:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpLow = icmp sgt i32 %x, 0
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+ %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 0
+ ret i32 %saturateLow
+}
+
+; x < k ? (x < 0 ? 0 : x) : k
+define i32 @unsigned_sat_upper_lower_1(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_1:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpUp = icmp slt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607
+ ret i32 %saturateUp
+}
+
+; x > k ? k : (x < 0 ? 0 : x)
+define i32 @unsigned_sat_upper_lower_2(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_2:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; k < x ? k : (x > 0 ? x : 0)
+define i32 @unsigned_sat_upper_lower_3(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_3:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+ %cmpUp = icmp slt i32 8388607, %x
+ %cmpLow = icmp sgt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 %x, i32 0
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+;
+; The following tests check for patterns that should not transform
+; into USAT but are similar enough that they could confuse the selector.
+;
+; x > k ? k : (x > 0 ? 0 : x)
+; First condition upper-saturates, second doesn't lower-saturate.
+define i32 @no_unsigned_sat_missing_lower(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_missing_lower
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp sgt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; x < k ? k : (x < 0 ? 0 : x)
+; Second condition lower-saturates, first doesn't upper-saturate.
+define i32 @no_unsigned_sat_missing_upper(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_missing_upper:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp slt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; Lower constant is different in the select and in the compare
+define i32 @no_unsigned_sat_incorrect_constant(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_constant:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 -1, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; The interval is not [0, k]
+define i32 @no_unsigned_sat_incorrect_interval(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_interval:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -4
+ %saturateLow = select i1 %cmpLow, i32 -4, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; The returned value (y) is not the same as the tested value (x).
+define i32 @no_unsigned_sat_incorrect_return(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_return:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %y
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; One of the values in a compare (y) is not the same as the rest
+; of the compare and select values (x).
+define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_compare:
+; CHECK-NOT: usat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %y, 0
+ %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
diff --git a/test/CodeGen/BPF/objdump_imm_hex.ll b/test/CodeGen/BPF/objdump_imm_hex.ll
new file mode 100644
index 000000000000..a245a6c791f2
--- /dev/null
+++ b/test/CodeGen/BPF/objdump_imm_hex.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d - | FileCheck --check-prefix=CHECK-DEC %s
+; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d -print-imm-hex - | FileCheck --check-prefix=CHECK-HEX %s
+
+; Source Code:
+; int gbl;
+; int test(unsigned long long a, unsigned long long b) {
+; int ret = 0;
+; if (a == 0xABCDABCDabcdabcdULL) {
+; gbl = gbl * gbl * 2;
+; ret = 1;
+; goto out;
+; }
+; if (b == 0xABCDabcdabcdULL) {
+; gbl = gbl * 4;
+; ret = 2;
+; }
+; out:
+; return ret;
+; }
+
+@gbl = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind
+define i32 @test(i64, i64) local_unnamed_addr #0 {
+; CHECK-LABEL: test
+ %3 = icmp eq i64 %0, -6067004223159161907
+ br i1 %3, label %4, label %8
+; CHECK-DEC: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -6067004223159161907 ll
+; CHECK-DEC: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +7
+; CHECK-HEX: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -0x5432543254325433 ll
+; CHECK-HEX: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +0x7
+
+; <label>:4: ; preds = %2
+ %5 = load i32, i32* @gbl, align 4
+ %6 = shl i32 %5, 1
+; CHECK-DEC: 67 01 00 00 01 00 00 00 r1 <<= 1
+; CHECK-HEX: 67 01 00 00 01 00 00 00 r1 <<= 0x1
+ %7 = mul i32 %6, %5
+ br label %13
+
+; <label>:8: ; preds = %2
+ %9 = icmp eq i64 %1, 188899839028173
+; CHECK-DEC: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00 r1 = 188899839028173 ll
+; CHECK-HEX: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00 r1 = 0xabcdabcdabcd ll
+ br i1 %9, label %10, label %16
+
+; <label>:10: ; preds = %8
+ %11 = load i32, i32* @gbl, align 4
+ %12 = shl nsw i32 %11, 2
+ br label %13
+
+; <label>:13: ; preds = %4, %10
+ %14 = phi i32 [ %12, %10 ], [ %7, %4 ]
+ %15 = phi i32 [ 2, %10 ], [ 1, %4 ]
+ store i32 %14, i32* @gbl, align 4
+; CHECK-DEC: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0) = r1
+; CHECK-HEX: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0x0) = r1
+ br label %16
+
+; <label>:16: ; preds = %13, %8
+ %17 = phi i32 [ 0, %8 ], [ %15, %13 ]
+ ret i32 %17
+}
+
+attributes #0 = { norecurse nounwind }
diff --git a/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
new file mode 100644
index 000000000000..f96dbf2af496
--- /dev/null
+++ b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this doesn't crash.
+; CHECK: sfcmp
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define void @fred() #0 {
+b0:
+ %v1 = load <16 x float>, <16 x float>* null, align 8
+ %v2 = fcmp olt <16 x float> undef, %v1
+ %v3 = select <16 x i1> %v2, <16 x i16> undef, <16 x i16> zeroinitializer
+ %v4 = sext <16 x i16> %v3 to <16 x i32>
+ store <16 x i32> %v4, <16 x i32>* undef, align 64
+ unreachable
+}
+
+attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b,+hvxv60" }
diff --git a/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll
new file mode 100644
index 000000000000..4cbd00837fc6
--- /dev/null
+++ b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this testcase doesn't crash.
+; CHECK: sfcmp
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define void @fred() #0 {
+b0:
+ %v1 = fcmp olt <16 x float> zeroinitializer, undef
+ %v2 = select <16 x i1> %v1, <16 x i16> undef, <16 x i16> zeroinitializer
+ %v3 = sext <16 x i16> %v2 to <16 x i32>
+ store <16 x i32> %v3, <16 x i32>* undef, align 128
+ unreachable
+}
+
+attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b" }
diff --git a/test/CodeGen/Hexagon/autohvx/isel-select-const.ll b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll
new file mode 100644
index 000000000000..c251292c9da4
--- /dev/null
+++ b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that this doesn't crash.
+; CHECK: vlut32
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown--elf"
+
+define void @fred() #0 {
+b0:
+ %v1 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> undef, i32 3)
+ %v2 = bitcast <16 x i32> %v1 to <64 x i8>
+ %v3 = shufflevector <64 x i8> %v2, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v4 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %v3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %v5 = bitcast <64 x i8> %v4 to <16 x i32>
+ %v6 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v5)
+ store <16 x i32> %v6, <16 x i32>* undef, align 1
+ %v7 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> zeroinitializer, i32 3)
+ %v8 = bitcast <16 x i32> %v7 to <64 x i8>
+ %v9 = shufflevector <64 x i8> %v8, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v10 = shufflevector <32 x i8> %v9, <32 x i8> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %v11 = bitcast <64 x i8> %v10 to <16 x i32>
+ %v12 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v11)
+ store <16 x i32> %v12, <16 x i32>* undef, align 1
+ unreachable
+}
+
+declare <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 88eaec938fd3..5ac0f59bd2d1 100644
--- a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -12,7 +12,7 @@
; CHECK-LABEL: fred:
; CHECK: v[[REG:[0-9]+]] = vsplat
-; CHECK: vmem(r29+#6) = v[[REG]]
+; CHECK: vmem(r29+#{{[0-9]+}}) = v[[REG]]
target triple = "hexagon"
diff --git a/test/CodeGen/Hexagon/v60-cur.ll b/test/CodeGen/Hexagon/v60-cur.ll
index 26d40c9a6975..d0ffe1d8fdd8 100644
--- a/test/CodeGen/Hexagon/v60-cur.ll
+++ b/test/CodeGen/Hexagon/v60-cur.ll
@@ -1,9 +1,8 @@
-; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
; Test that we generate a .cur
-; CHECK: v{{[0-9]*}}.cur{{ *}}
-; CHECK: v{{[0-9]*}}.cur{{ *}}
+; CHECK: v{{[0-9]*}}.cur
define void @conv3x3_i(i8* noalias nocapture readonly %iptr0, i32 %shift, i32 %width) #0 {
entry:
diff --git a/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll
new file mode 100644
index 000000000000..af2a55ea47d5
--- /dev/null
+++ b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon -debug-only=isel < %s 2>/dev/null
+; REQUIRES: asserts
+
+; Make sure that this doesn't crash. Enabling the debug option used to trigger
+; a failing assertion about a type mismatch in formal arguments.
+; CHECK: vaddub
+
+define i1 @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind {
+entry:
+ %0 = add <4 x i8> %a, %b
+ %1 = bitcast <4 x i8> %0 to <32 x i1>
+ %2 = extractelement <32 x i1> %1, i32 0
+ ret i1 %2
+}
diff --git a/test/CodeGen/Hexagon/vect/vect-infloop.ll b/test/CodeGen/Hexagon/vect/vect-infloop.ll
index 4de390159fdd..9ee0b0ab3aa6 100644
--- a/test/CodeGen/Hexagon/vect/vect-infloop.ll
+++ b/test/CodeGen/Hexagon/vect/vect-infloop.ll
@@ -1,10 +1,10 @@
; Extracted from test/CodeGen/Generic/vector-casts.ll: used to loop indefinitely.
; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: combine
+; CHECK: convert_df2w
define void @a(<2 x double>* %p, <2 x i8>* %q) {
- %t = load <2 x double>, <2 x double>* %p
- %r = fptosi <2 x double> %t to <2 x i8>
- store <2 x i8> %r, <2 x i8>* %q
- ret void
+ %t = load <2 x double>, <2 x double>* %p
+ %r = fptosi <2 x double> %t to <2 x i8>
+ store <2 x i8> %r, <2 x i8>* %q
+ ret void
}
diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll
index f7b8ea5f9e15..4f926cbee0b2 100644
--- a/test/CodeGen/Mips/llvm-ir/extractelement.ll
+++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll
@@ -15,5 +15,5 @@ define i1 @via_stack_bug(i8 signext %idx) {
; ALL-DAG: sh [[ONE]], 6($sp)
; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1
; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6
-; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]]
+; ALL-DAG: or [[EPTR:\$[0-9]+]], [[VPTR]], [[MASKED_IDX]]
; ALL: lbu $2, 0([[EPTR]])
diff --git a/test/CodeGen/Mips/long-call-mcount.ll b/test/CodeGen/Mips/long-call-mcount.ll
new file mode 100644
index 000000000000..70a4410d060b
--- /dev/null
+++ b/test/CodeGen/Mips/long-call-mcount.ll
@@ -0,0 +1,19 @@
+; Check the call to mcount under the long-call and short-call options.
+; RUN: llc -march=mips -target-abi o32 --mattr=+long-calls,+noabicalls < %s \
+; RUN: | FileCheck -check-prefixes=CHECK,LONG %s
+; RUN: llc -march=mips -target-abi o32 --mattr=-long-calls,+noabicalls < %s \
+; RUN: | FileCheck -check-prefixes=CHECK,SHORT %s
+
+; Function Attrs: noinline nounwind optnone
+define void @foo() #0 {
+entry:
+ ret void
+
+; CHECK-LABEL: foo
+; LONG: lui $1, %hi(_mcount)
+; LONG-NEXT: addiu $25, $1, %lo(_mcount)
+; LONG-NEXT: jalr $25
+; SHORT: jal _mcount
+}
+
+attributes #0 = { "instrument-function-entry-inlined"="_mcount" }
diff --git a/test/CodeGen/Mips/sll-micromips-r6-encoding.mir b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir
new file mode 100644
index 000000000000..85ce251ac315
--- /dev/null
+++ b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir
@@ -0,0 +1,46 @@
+# RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips %s -start-after=xray-instrumentation -o - -show-mc-encoding | FileCheck %s
+
+# Test that 'sll $zero, $zero, 0' is correctly recognized as a real
+# instruction rather than as an unimplemented opcode for the purposes of
+# encoding the instruction.
+
+# CHECK-LABEL: a:
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jrc $ra # encoding: [0x45,0xbf]
+---
+name: a
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: false
+registers:
+liveins:
+ - { reg: '%a0', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0.entry:
+ renamable %zero = SLL_MMR6 killed renamable %zero, 0
+ JRC16_MM undef %ra, implicit %v0
+
+...
diff --git a/test/CodeGen/PowerPC/cmp_elimination.ll b/test/CodeGen/PowerPC/cmp_elimination.ll
index 3251ae2881b9..6bc8b8a041c2 100644
--- a/test/CodeGen/PowerPC/cmp_elimination.ll
+++ b/test/CodeGen/PowerPC/cmp_elimination.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
@@ -748,6 +747,37 @@ do.end:
ret void
}
+define void @func29(i32 signext %a) {
+; We cannot merge the two compares due to a difference in sign-extension behavior.
+; Equivalent C code example:
+; int a = .. ;
+; if (a == -1) dummy1();
+; if (a == (uint16_t)-1) dummy2();
+
+; CHECK-LABEL: @func29
+; CHECK: cmp
+; CHECK: cmp
+; CHECK: blr
+entry:
+ %cmp = icmp eq i32 %a, -1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ tail call void @dummy1()
+ br label %if.end3
+
+if.else:
+ %cmp1 = icmp eq i32 %a, 65535
+ br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:
+ tail call void @dummy2()
+ br label %if.end3
+
+if.end3:
+ ret void
+}
+
declare void @dummy1()
declare void @dummy2()
declare void @dummy3()
diff --git a/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll
new file mode 100644
index 000000000000..ad8dd90ea920
--- /dev/null
+++ b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; Ensure we don't crash by trying to convert directly from a subword load
+; to a ppc_fp128, as we do for conversions to f32/f64.
+define ppc_fp128 @test(i16* nocapture readonly %Ptr) {
+entry:
+ %0 = load i16, i16* %Ptr, align 2
+ %conv = uitofp i16 %0 to ppc_fp128
+ ret ppc_fp128 %conv
+; CHECK: lhz [[LD:[0-9]+]], 0(3)
+; CHECK: mtvsrwa [[MV:[0-9]+]], [[LD]]
+; CHECK: xscvsxddp [[CONV:[0-9]+]], [[MV]]
+; CHECK: bl __gcc_qadd
+}
diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index 82c6c318abdc..247961e85b12 100644
--- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -25,7 +25,7 @@ entry:
; CHECK: extsw 3, [[RSHREG]]
; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
-; CHECK-P7: lwax 3, [[ELEMOFFREG]],
+; CHECK-P7: lwax 3, 3, [[ELEMOFFREG]]
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2
; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 2
; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
@@ -54,7 +54,7 @@ entry:
; CHECK: mfvsrd 3,
; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28
; CHECK-P7-DAG: stxvd2x 34,
-; CHECK-P7: ldx 3, [[ELEMOFFREG]],
+; CHECK-P7: ldx 3, 3, [[ELEMOFFREG]]
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 1
; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3
; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
@@ -77,7 +77,7 @@ entry:
; CHECK: xscvspdpn 1,
; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
-; CHECK-P7: lfsx 1, [[ELEMOFFREG]],
+; CHECK-P7: lfsx 1, 3, [[ELEMOFFREG]]
; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2
; CHECK-BE: lvsl [[SHMSKREG:[0-9]+]], 0, [[ELNOREG]]
; CHECK-BE: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]]
diff --git a/test/CodeGen/Thumb2/t2sizereduction.mir b/test/CodeGen/Thumb2/t2sizereduction.mir
new file mode 100644
index 000000000000..377c0ccc7b0a
--- /dev/null
+++ b/test/CodeGen/Thumb2/t2sizereduction.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s
+
+--- |
+ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv8m.main-arm-none-eabi"
+
+ ; Function Attrs: norecurse nounwind readnone
+ define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
+ entry:
+ %cmp6 = icmp sgt i32 %y, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
+ ret i32 %sum.0.lcssa
+
+ for.body: ; preds = %for.body, %for.body.preheader
+ %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
+ %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
+ %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+ %mul = mul nsw i32 %lsr.iv1, %sum.07
+ %lsr.iv.next = add i32 %lsr.iv, -1
+ %lsr.iv.next2 = add i32 %lsr.iv1, 1
+ %exitcond = icmp eq i32 %lsr.iv.next, 0
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+ }
+
+ attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+fp-only-sp,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name: test
+tracksRegLiveness: true
+liveins:
+ - { reg: '%r0', virtual-reg: '' }
+ - { reg: '%r1', virtual-reg: '' }
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: %r0, %r1
+ ; CHECK: %r2 = tMOVr %r0, 14, %noreg
+ ; CHECK: %r0, dead %cpsr = tMOVi8 1, 14, %noreg
+ ; CHECK: tCMPi8 %r1, 1, 14, %noreg, implicit-def %cpsr
+ ; CHECK: t2Bcc %bb.2, 11, killed %cpsr
+ ; CHECK: bb.1.for.body:
+ ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK: liveins: %r0, %r1, %r2
+ ; CHECK: %r0, dead %cpsr = tMUL %r2, killed %r0, 14, %noreg
+ ; CHECK: %r2, dead %cpsr = tADDi8 killed %r2, 1, 14, %noreg
+ ; CHECK: %r1, %cpsr = tSUBi8 killed %r1, 1, 14, %noreg
+ ; CHECK: t2Bcc %bb.1, 1, killed %cpsr
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: liveins: %r0
+ ; CHECK: tBX_RET 14, %noreg, implicit %r0
+ bb.0.entry:
+ successors: %bb.1.for.body, %bb.2.for.cond.cleanup
+ liveins: %r0, %r1
+
+ %r2 = tMOVr %r0, 14, _
+ %r0 = t2MOVi 1, 14, _, _
+ t2CMPri %r1, 1, 14, _, implicit-def %cpsr
+ t2Bcc %bb.2.for.cond.cleanup, 11, killed %cpsr
+
+ bb.1.for.body:
+ successors: %bb.2.for.cond.cleanup, %bb.1.for.body
+ liveins: %r0, %r1, %r2
+
+ %r0 = t2MUL %r2, killed %r0, 14, _
+ %r2 = t2ADDri killed %r2, 1, 14, _, _
+ %r1 = t2SUBri killed %r1, 1, 14, _, def %cpsr
+ t2Bcc %bb.1.for.body, 1, killed %cpsr
+
+ bb.2.for.cond.cleanup:
+ liveins: %r0
+
+ tBX_RET 14, _, implicit %r0
+
+...
diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll
index 578d7aa75287..d32b0e70791a 100644
--- a/test/CodeGen/X86/avg-mask.ll
+++ b/test/CodeGen/X86/avg-mask.ll
@@ -143,16 +143,8 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
@@ -201,16 +193,8 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index d1e26b787f48..dd11f6ca2935 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -90,150 +90,23 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm6, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm11, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm10, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm9
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm5
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm6
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm7
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm9, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: pavgb (%rdi), %xmm1
+; SSE2-NEXT: pavgb 16(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -265,444 +138,467 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
ret void
}
-define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
-; SSE2-LABEL: avg_v64i8:
+define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
+; SSE2-LABEL: avg_v48i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm6
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: movdqa 32(%rdi), %xmm1
-; SSE2-NEXT: movdqa 48(%rdi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm6
+; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rsi), %xmm12
; SSE2-NEXT: movdqa 16(%rsi), %xmm13
-; SSE2-NEXT: movdqa 32(%rsi), %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm6, %xmm5
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm14, %xmm12
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm15, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm13, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm8, %xmm15
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm13
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm7, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm2, %xmm14
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa 48(%rsi), %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa 32(%rsi), %xmm0
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: movdqa %xmm12, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm1, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: psubd %xmm0, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm12
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm15
-; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm14
-; SSE2-NEXT: psubd %xmm0, %xmm11
-; SSE2-NEXT: psubd %xmm0, %xmm8
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: packuswb %xmm1, %xmm10
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm12, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm10, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm13, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm15, %xmm10
+; SSE2-NEXT: movdqa %xmm2, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm14, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm6, %xmm13
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm15, %xmm14
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm2, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE2-NEXT: paddd %xmm11, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: psubd %xmm5, %xmm8
+; SSE2-NEXT: psubd %xmm5, %xmm3
+; SSE2-NEXT: psubd %xmm5, %xmm9
+; SSE2-NEXT: psubd %xmm5, %xmm12
+; SSE2-NEXT: psubd %xmm5, %xmm10
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: psubd %xmm5, %xmm1
+; SSE2-NEXT: psubd %xmm5, %xmm13
+; SSE2-NEXT: psubd %xmm5, %xmm14
+; SSE2-NEXT: psubd %xmm5, %xmm6
+; SSE2-NEXT: psubd %xmm5, %xmm2
+; SSE2-NEXT: psubd %xmm5, %xmm0
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm12, %xmm4
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: pand %xmm7, %xmm9
+; SSE2-NEXT: pand %xmm7, %xmm12
+; SSE2-NEXT: packuswb %xmm9, %xmm12
+; SSE2-NEXT: packuswb %xmm3, %xmm12
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm10
+; SSE2-NEXT: pand %xmm7, %xmm10
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: packuswb %xmm10, %xmm4
; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: psrld $1, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: packuswb %xmm15, %xmm13
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm13
+; SSE2-NEXT: packuswb %xmm1, %xmm13
; SSE2-NEXT: packuswb %xmm4, %xmm13
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm9, %xmm6
-; SSE2-NEXT: psrld $1, %xmm11
; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: packuswb %xmm14, %xmm11
-; SSE2-NEXT: packuswb %xmm6, %xmm11
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: packuswb %xmm8, %xmm3
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: packuswb %xmm3, %xmm7
-; SSE2-NEXT: movdqu %xmm7, (%rax)
-; SSE2-NEXT: movdqu %xmm11, (%rax)
+; SSE2-NEXT: pand %xmm7, %xmm14
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: packuswb %xmm14, %xmm6
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm13, (%rax)
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm12, (%rax)
; SSE2-NEXT: retq
;
-; AVX1-LABEL: avg_v64i8:
+; AVX1-LABEL: avg_v48i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $24, %rsp
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm13
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm12
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm11
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm10
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm8
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm9
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm14
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm15
-; AVX1-NEXT: vmovdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm12
-; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm11
-; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10
-; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm9
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa (%rdi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm14, %xmm14
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm5, %xmm14, %xmm14
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm14, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm6
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm13, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm12, %xmm7
-; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vmovdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm15
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm14
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm10
+; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm9
+; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm11
+; AVX1-NEXT: vpsubd %xmm7, %xmm12, %xmm12
+; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm7, %xmm13, %xmm4
+; AVX1-NEXT: vpsubd %xmm7, %xmm15, %xmm5
+; AVX1-NEXT: vpsubd %xmm7, %xmm14, %xmm1
+; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm14
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm15
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm13
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm12, %xmm12
+; AVX1-NEXT: vpsrld $1, %xmm11, %xmm11
+; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm4
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3
; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vmovups %ymm1, (%rax)
-; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: avg_v64i8:
+; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
-; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9
-; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10
-; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1
-; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: vmovdqa (%rsi), %ymm3
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm9, %ymm5, %ymm5
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm9, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm7, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
+; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
+; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7
-; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: avg_v48i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512F-NEXT: vpavgb %xmm5, %xmm4, %xmm4
+; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v48i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512BW-NEXT: vpaddd %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; AVX512BW-NEXT: vpaddd %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrld $1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpsrld $1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ %1 = load <48 x i8>, <48 x i8>* %a
+ %2 = load <48 x i8>, <48 x i8>* %b
+ %3 = zext <48 x i8> %1 to <48 x i32>
+ %4 = zext <48 x i8> %2 to <48 x i32>
+ %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = add nuw nsw <48 x i32> %5, %4
+ %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <48 x i32> %7 to <48 x i8>
+ store <48 x i8> %8, <48 x i8>* undef, align 4
+ ret void
+}
+
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
+; SSE2-LABEL: avg_v64i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: movdqa 16(%rsi), %xmm2
+; SSE2-NEXT: movdqa 48(%rsi), %xmm3
+; SSE2-NEXT: pavgb (%rdi), %xmm1
+; SSE2-NEXT: pavgb 16(%rdi), %xmm2
+; SSE2-NEXT: pavgb 32(%rsi), %xmm0
+; SSE2-NEXT: pavgb 48(%rdi), %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: avg_v64i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v64i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm1
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -782,81 +678,23 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm3, %xmm0
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: pavgw (%rdi), %xmm1
+; SSE2-NEXT: pavgw 16(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -891,207 +729,60 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
-; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm9
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: movdqa 32(%rsi), %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: pslld $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm9
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: packssdw %xmm7, %xmm9
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm4, %xmm3
+; SSE2-NEXT: pavgw (%rdi), %xmm1
+; SSE2-NEXT: pavgw 16(%rdi), %xmm2
+; SSE2-NEXT: pavgw 32(%rsi), %xmm0
+; SSE2-NEXT: pavgw 48(%rdi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm9, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1199,150 +890,23 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm6, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm11, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm10, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm9
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm5
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm6
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm7
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm9, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1377,249 +941,31 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rsi), %xmm14
-; SSE2-NEXT: movdqa 16(%rsi), %xmm12
+; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
-; SSE2-NEXT: movdqa 48(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm12, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm1, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm4, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm3, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm2
-; SSE2-NEXT: paddd %xmm10, %xmm10
-; SSE2-NEXT: paddd %xmm5, %xmm5
-; SSE2-NEXT: paddd %xmm11, %xmm11
-; SSE2-NEXT: paddd %xmm12, %xmm12
-; SSE2-NEXT: paddd %xmm9, %xmm9
-; SSE2-NEXT: paddd %xmm6, %xmm6
-; SSE2-NEXT: paddd %xmm13, %xmm13
-; SSE2-NEXT: paddd %xmm14, %xmm14
-; SSE2-NEXT: paddd %xmm8, %xmm8
-; SSE2-NEXT: paddd %xmm7, %xmm7
-; SSE2-NEXT: paddd %xmm15, %xmm15
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm15
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psubd %xmm0, %xmm8
-; SSE2-NEXT: psubd %xmm0, %xmm14
-; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm12
-; SSE2-NEXT: psubd %xmm0, %xmm11
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm10
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm15
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm15, %xmm7
-; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: packuswb %xmm8, %xmm14
-; SSE2-NEXT: packuswb %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm13, %xmm6
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm9, %xmm12
-; SSE2-NEXT: packuswb %xmm6, %xmm12
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: packuswb %xmm11, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm10, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm5, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqa 48(%rsi), %xmm3
+; SSE2-NEXT: pavgb %xmm0, %xmm0
+; SSE2-NEXT: pavgb %xmm1, %xmm1
+; SSE2-NEXT: pavgb %xmm2, %xmm2
+; SSE2-NEXT: pavgb %xmm3, %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm12, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpaddd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm6
-; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
-; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm2, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vpaddd %xmm15, %xmm15, %xmm15
-; AVX1-NEXT: vpaddd %xmm14, %xmm14, %xmm14
-; AVX1-NEXT: vpaddd %xmm13, %xmm13, %xmm13
-; AVX1-NEXT: vpaddd %xmm12, %xmm12, %xmm12
-; AVX1-NEXT: vpaddd %xmm11, %xmm11, %xmm11
-; AVX1-NEXT: vpaddd %xmm10, %xmm10, %xmm10
-; AVX1-NEXT: vpaddd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10
-; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm9
-; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm7
-; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm11
-; AVX1-NEXT: vpsubd %xmm0, %xmm14, %xmm13
-; AVX1-NEXT: vpsubd %xmm0, %xmm15, %xmm12
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm15
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm14
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm5
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm6
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm8, %xmm6, %xmm8
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm10, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm12, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm13, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vpsrld $1, %xmm15, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
@@ -1628,71 +974,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
-; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9
-; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10
-; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1
-; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12
-; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7
-; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa (%rsi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
@@ -1700,29 +985,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
;
; AVX512F-LABEL: avg_v64i8_2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm1
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
@@ -1805,81 +1071,23 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: movdqa (%rsi), %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: psubd %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm3, %xmm0
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
+; SSE2-NEXT: pavgw (%rsi), %xmm0
+; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1914,187 +1122,47 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
-; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm9
-; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: movdqa 32(%rsi), %xmm2
-; SSE2-NEXT: movdqa 48(%rsi), %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm7
-; SSE2-NEXT: psubd %xmm0, %xmm9
-; SSE2-NEXT: psubd %xmm0, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: pslld $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm9
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: packssdw %xmm7, %xmm9
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm4, %xmm3
-; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 48(%rdi), %xmm2
+; SSE2-NEXT: movdqa 32(%rsi), %xmm3
+; SSE2-NEXT: pavgw (%rsi), %xmm0
+; SSE2-NEXT: pavgw 16(%rsi), %xmm1
+; SSE2-NEXT: pavgw 32(%rdi), %xmm3
+; SSE2-NEXT: pavgw 48(%rsi), %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm9, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vmovdqa (%rsi), %ymm2
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
@@ -2102,19 +1170,12 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
;
; AVX512F-LABEL: avg_v32i16_2:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2209,89 +1270,21 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa 16(%rdi), %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm9, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm4, %xmm8
-; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: pavgb 16(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
-; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -2324,211 +1317,33 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v64i8_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm6
-; SSE2-NEXT: movdqa 32(%rdi), %xmm15
-; SSE2-NEXT: movdqa 48(%rdi), %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm12
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm15
-; SSE2-NEXT: paddd %xmm0, %xmm14
-; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: paddd %xmm0, %xmm13
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm6, %xmm4
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: packuswb %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm4, %xmm8
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: psrld $1, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm15, %xmm12
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: packuswb %xmm14, %xmm13
-; SSE2-NEXT: packuswb %xmm12, %xmm13
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm11, %xmm2
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: packuswb %xmm3, %xmm10
-; SSE2-NEXT: packuswb %xmm2, %xmm10
-; SSE2-NEXT: movdqu %xmm10, (%rax)
-; SSE2-NEXT: movdqu %xmm13, (%rax)
-; SSE2-NEXT: movdqu %xmm8, (%rax)
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgb %xmm0, %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: pavgb %xmm0, %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm3
+; SSE2-NEXT: pavgb %xmm0, %xmm3
+; SSE2-NEXT: pavgb 48(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm15
-; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm7
-; AVX1-NEXT: vpaddd %xmm0, %xmm11, %xmm11
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm9
-; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm2, %xmm12, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm10, %xmm10
-; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm8
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm6
-; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm12 # 16-byte Folded Reload
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm14
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm10, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm15, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm13, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm7, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm11, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm12, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -2536,82 +1351,21 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
;
; AVX2-LABEL: avg_v64i8_const:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,2,3,4,5,6,7,8]
-; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
-; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3
-; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_const:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
-; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
+; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
-; AVX512F-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2680,57 +1434,21 @@ define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v16i16_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: pavgw 16(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -2763,148 +1481,57 @@ define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v32i16_const:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm7
-; SSE2-NEXT: movdqa 16(%rdi), %xmm6
-; SSE2-NEXT: movdqa 32(%rdi), %xmm4
-; SSE2-NEXT: movdqa 48(%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm8, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: paddd %xmm9, %xmm3
-; SSE2-NEXT: paddd %xmm8, %xmm4
-; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: pslld $16, %xmm7
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: packssdw %xmm7, %xmm5
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm6, %xmm3
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm4, %xmm2
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pavgw %xmm0, %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: pavgw %xmm0, %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm3
+; SSE2-NEXT: pavgw %xmm0, %xmm3
+; SSE2-NEXT: pavgw 48(%rdi), %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm3, (%rax)
-; SSE2-NEXT: movdqu %xmm5, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_const:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
-; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm4
-; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovups %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rax)
-; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_const:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,3,4,5,6,7,8]
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
+; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vmovdqu %ymm2, (%rax)
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_const:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
-; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
-; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
+; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2946,77 +1573,16 @@ define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
; SSE2-LABEL: avg_v32i8_3:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; SSE2-NEXT: paddw %xmm6, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT: paddw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
-; SSE2-NEXT: paddw %xmm7, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddw %xmm3, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: psubw %xmm3, %xmm4
-; SSE2-NEXT: psubw %xmm3, %xmm0
-; SSE2-NEXT: psubw %xmm3, %xmm2
-; SSE2-NEXT: psubw %xmm3, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: pavgb %xmm2, %xmm0
+; SSE2-NEXT: pavgb %xmm3, %xmm1
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpaddw %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsubw %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3041,203 +1607,36 @@ define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
; SSE2-LABEL: avg_v64i8_3:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm12
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm10, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm11, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm12, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
-; SSE2-NEXT: paddw %xmm13, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; SSE2-NEXT: paddw %xmm7, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2-NEXT: psubw %xmm7, %xmm8
-; SSE2-NEXT: psubw %xmm7, %xmm0
-; SSE2-NEXT: psubw %xmm7, %xmm4
-; SSE2-NEXT: psubw %xmm7, %xmm1
-; SSE2-NEXT: psubw %xmm7, %xmm5
-; SSE2-NEXT: psubw %xmm7, %xmm2
-; SSE2-NEXT: psubw %xmm7, %xmm6
-; SSE2-NEXT: psubw %xmm7, %xmm3
-; SSE2-NEXT: psrlw $1, %xmm3
-; SSE2-NEXT: psrlw $1, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm7, %xmm8
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm7, %xmm5
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pavgb %xmm4, %xmm0
+; SSE2-NEXT: pavgb %xmm5, %xmm1
+; SSE2-NEXT: pavgb %xmm6, %xmm2
+; SSE2-NEXT: pavgb %xmm7, %xmm3
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm12
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm13
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm4, %xmm6, %xmm14
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm15
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm11, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpaddw %xmm7, %xmm9, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpaddw %xmm3, %xmm10, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpsubw %xmm5, %xmm12, %xmm8
-; AVX1-NEXT: vpsubw %xmm5, %xmm13, %xmm4
-; AVX1-NEXT: vpsubw %xmm5, %xmm14, %xmm0
-; AVX1-NEXT: vpsubw %xmm5, %xmm15, %xmm1
-; AVX1-NEXT: vpsubw %xmm5, %xmm6, %xmm6
-; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm7
-; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm9
-; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm5
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm1
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_3:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
-; AVX2-NEXT: vpaddw %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT: vpaddw %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpsubw %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsubw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_3:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_3:
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 6e6d61f37d2e..248462d0de51 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -17,78 +17,40 @@ define <16 x i1> @test1() {
}
define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
-; KNL-LABEL: test2:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test2:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
-; SKX-NEXT: vpmovb2m %xmm1, %k0
-; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
-; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: kandw %k0, %k1, %k0
-; SKX-NEXT: vpmovm2b %k0, %xmm0
-; SKX-NEXT: retq
+; ALL_X64-LABEL: test2:
+; ALL_X64: ## %bb.0:
+; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL_X64-NEXT: vpsllw $7, %xmm0, %xmm0
+; ALL_X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; ALL_X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL_X64-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test2:
; KNL_X32: ## %bb.0:
-; KNL_X32-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL_X32-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL_X32-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
+; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsllw $7, %xmm0, %xmm0
+; KNL_X32-NEXT: vpand LCPI1_0, %xmm0, %xmm0
+; KNL_X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL_X32-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
ret <16 x i1> %c
}
define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
-; KNL-LABEL: test3:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
-; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
-; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test3:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
-; SKX-NEXT: vpmovw2m %xmm1, %k0
-; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
-; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: kandb %k0, %k1, %k0
-; SKX-NEXT: vpmovm2w %k0, %xmm0
-; SKX-NEXT: retq
+; ALL_X64-LABEL: test3:
+; ALL_X64: ## %bb.0:
+; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL_X64-NEXT: vpsllw $15, %xmm0, %xmm0
+; ALL_X64-NEXT: vpsraw $15, %xmm0, %xmm0
+; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test3:
; KNL_X32: ## %bb.0:
-; KNL_X32-NEXT: vpmovsxwq %xmm1, %zmm1
-; KNL_X32-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
-; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0
+; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
ret <8 x i1> %c
@@ -102,11 +64,9 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
;
; SKX-LABEL: test4:
; SKX: ## %bb.0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k0 {%k1}
-; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: vpsrad $31, %xmm0, %xmm0
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test4:
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index 97beff63811a..8c7941591217 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -1366,21 +1366,12 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
}
define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
-; KNL-LABEL: trunc_4i32_to_4i1:
-; KNL: # %bb.0:
-; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_4i32_to_4i1:
-; SKX: # %bb.0:
-; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
-; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1}
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: retq
+; ALL-LABEL: trunc_4i32_to_4i1:
+; ALL: # %bb.0:
+; ALL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL-NEXT: vpslld $31, %xmm0, %xmm0
+; ALL-NEXT: vpsrad $31, %xmm0, %xmm0
+; ALL-NEXT: retq
%mask_a = trunc <4 x i32>%a to <4 x i1>
%mask_b = trunc <4 x i32>%b to <4 x i1>
%a_and_b = and <4 x i1>%mask_a, %mask_b
diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 34ea468aebee..e1ed8ea98a1c 100644
--- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -249,9 +249,9 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -261,10 +261,11 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -340,10 +341,10 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -353,11 +354,12 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -433,9 +435,9 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -445,10 +447,11 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -555,10 +558,10 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm2
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vpmovm2d %k0, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -568,11 +571,12 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
-; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
+; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1054,9 +1058,9 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1065,10 +1069,11 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1159,10 +1164,10 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1171,11 +1176,12 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1266,9 +1272,9 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1277,10 +1283,11 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1399,10 +1406,10 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
-; AVX512-NEXT: vpmovm2q %k0, %zmm0
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1411,11 +1418,12 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
-; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
-; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
+; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 7e0b981b2c6a..7477e05f0c7f 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -793,11 +793,10 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k1
-; KNL-NEXT: movl {{.*}}(%rip), %ecx
-; KNL-NEXT: vpbroadcastd %ecx, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k1
-; KNL-NEXT: vpbroadcastd %ecx, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -1432,8 +1431,7 @@ define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: andl $15, %edi
-; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: movb -24(%rsp,%rdi), %al
; CHECK-NEXT: retq
%t2 = extractelement <16 x i8> %t1, i32 %index
ret i8 %t2
@@ -1452,8 +1450,7 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: andl $31, %edi
-; CHECK-NEXT: movq %rsp, %rax
-; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: movb (%rsp,%rdi), %al
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: vzeroupper
@@ -1477,8 +1474,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $63, %edi
-; KNL-NEXT: movq %rsp, %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
+; KNL-NEXT: movb (%rsp,%rdi), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@@ -1496,8 +1492,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
+; SKX-NEXT: movb (%rsp,%rdi), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
@@ -1522,8 +1517,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: movzbl %dil, %eax
; KNL-NEXT: andl $63, %eax
-; KNL-NEXT: movq %rsp, %rcx
-; KNL-NEXT: movb (%rax,%rcx), %al
+; KNL-NEXT: movb (%rsp,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
@@ -1542,8 +1536,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: movzbl %dil, %eax
; SKX-NEXT: andl $63, %eax
-; SKX-NEXT: movq %rsp, %rcx
-; SKX-NEXT: movb (%rax,%rcx), %al
+; SKX-NEXT: movb (%rsp,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
@@ -1617,45 +1610,28 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v8i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax
+; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v8i1:
; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; SKX-NEXT: vpmovm2q %k0, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax
; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <8 x i32> %a, %b
@@ -1667,43 +1643,26 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v16i1:
; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa32 %zmm0, (%rsp)
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax
+; KNL-NEXT: movzbl -24(%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v16i1:
; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; SKX-NEXT: vpmovm2d %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <16 x i32> %a, %b
@@ -1729,8 +1688,7 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
-; KNL-NEXT: movq %rsp, %rax
-; KNL-NEXT: movzbl (%rdi,%rax), %eax
+; KNL-NEXT: movzbl (%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
@@ -1744,14 +1702,14 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
-; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2b %k0, %ymm0
+; SKX-NEXT: vmovdqa %ymm0, (%rsp)
; SKX-NEXT: andl $31, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
@@ -1792,8 +1750,7 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: setne 32(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -1817,20 +1774,18 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
-; SKX-NEXT: xorl %eax, %eax
-; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: setne %al
-; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: andl $31, %esi
-; SKX-NEXT: movw %ax, (%rsp,%rsi,2)
-; SKX-NEXT: vpsllw $15, (%rsp), %zmm0
-; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vpmovm2b %k0, %ymm0
+; SKX-NEXT: vmovdqa %ymm0, (%rsp)
+; SKX-NEXT: setne (%rsp,%rsi)
+; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
@@ -1863,8 +1818,7 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: setne 64(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1905,13 +1859,12 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
+; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: andl $63, %esi
; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, %rax
@@ -2050,8 +2003,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; KNL-NEXT: setne (%rax,%rcx)
+; KNL-NEXT: setne 128(%rsp,%rax)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
@@ -2215,18 +2167,16 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
; SKX-NEXT: movl 744(%rbp), %eax
; SKX-NEXT: andl $127, %eax
; SKX-NEXT: cmpb $0, 736(%rbp)
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
-; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovm2b %k1, %zmm0
+; SKX-NEXT: vmovdqa32 %zmm0, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
-; SKX-NEXT: movq %rsp, %rcx
-; SKX-NEXT: setne (%rax,%rcx)
+; SKX-NEXT: setne (%rsp,%rax)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
@@ -2270,8 +2220,7 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: setne 128(%rsp,%rsi)
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
@@ -2336,17 +2285,15 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
; SKX-NEXT: subq $256, %rsp ## imm = 0x100
; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
-; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
; SKX-NEXT: andl $127, %esi
; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
-; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovm2b %k1, %zmm0
+; SKX-NEXT: vmovdqa32 %zmm0, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll
index e28e384ae996..9283fd32d746 100644
--- a/test/CodeGen/X86/avx512-insert-extract_i1.ll
+++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll
@@ -18,8 +18,7 @@ define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b,
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: movzbl (%rdi,%rax), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index dfe42d53483f..4877157d911d 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1783,20 +1783,19 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: vmovups (%rdi), %zmm2
; KNL-NEXT: vmovups 64(%rdi), %zmm3
; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; KNL-NEXT: movl {{.*}}(%rip), %eax
-; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k2
-; KNL-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm3, %xmm3
; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm5, %xmm5
; KNL-NEXT: vpor %xmm5, %xmm2, %xmm2
; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k1
-; KNL-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm4, %xmm4
; KNL-NEXT: vpor %xmm4, %xmm3, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
@@ -1886,20 +1885,19 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512DQ-NEXT: vmovups (%rdi), %zmm2
; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
+; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k2
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z}
+; AVX512DQ-NEXT: vpmovm2d %k2, %zmm3
; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3
; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
-; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5
; AVX512DQ-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4
; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 1b450b98a6d5..78111874b58a 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -4376,20 +4376,16 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
; GENERIC-LABEL: trunc_4i32_to_4i1:
; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
-; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: trunc_4i32_to_4i1:
; SKX: # %bb.0:
+; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50]
-; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%mask_a = trunc <4 x i32>%a to <4 x i1>
%mask_b = trunc <4 x i32>%b to <4 x i1>
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index df88f0fca456..0601c011e290 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1488,12 +1488,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x
define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1503,13 +1501,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1522,13 +1518,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1863,14 +1857,12 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,1,2,3,7,5,6,7]
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -1884,14 +1876,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp,
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7]
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
+; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
@@ -2298,13 +2288,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[3,1,2,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2315,13 +2304,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2332,12 +2320,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2348,13 +2335,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2405,13 +2391,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,0,3,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2422,13 +2407,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2585,12 +2569,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,1]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,1,1,5]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2602,14 +2585,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,1,1,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2669,12 +2651,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2686,14 +2667,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2739,11 +2719,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4
define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,2]
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
+; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2754,12 +2733,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,3]
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,3,2]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -2771,14 +2749,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
@@ -3307,14 +3284,13 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %v
define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = <0,4,u,u,6,1,4,4>
-; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0]
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,10,11,6,1,4,4]
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3325,14 +3301,13 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec
define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4>
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0]
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,10,11,6,1,4,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3775,12 +3750,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>*
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[2,3],ymm3[4,6],ymm2[6,7]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3795,12 +3769,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3815,12 +3788,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>*
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
-; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,3],ymm2[6,4],ymm3[6,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
@@ -3835,12 +3807,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 1182bbf94ec5..6bee0de181ab 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -30,10 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpmovm2q %k0, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; CHECK-NEXT: vpmovq2m %zmm0, %k0
+; CHECK-NEXT: vpmovm2d %k0, %ymm0
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: vpmovd2m %ymm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index fc684e54b063..826a4538f3f1 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -192,15 +192,14 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
-; KNL-NEXT: movl {{.*}}(%rip), %eax
-; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 9863a0a7d283..5f4b050b863d 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -101,7 +101,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -147,7 +147,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -282,7 +282,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -333,7 +333,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1354,7 +1354,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1401,7 +1401,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1543,7 +1543,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -1595,7 +1595,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4799,7 +4799,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4839,7 +4839,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4882,7 +4882,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4926,7 +4926,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -4970,7 +4970,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5014,7 +5014,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5061,7 +5061,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5106,7 +5106,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5154,7 +5154,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5203,7 +5203,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5252,7 +5252,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -5301,7 +5301,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9211,7 +9211,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9257,7 +9257,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9392,7 +9392,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -9443,7 +9443,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10464,7 +10464,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10511,7 +10511,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10653,7 +10653,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -10705,7 +10705,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13909,7 +13909,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13949,7 +13949,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13992,7 +13992,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14036,7 +14036,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14080,7 +14080,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14124,7 +14124,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14171,7 +14171,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14216,7 +14216,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14264,7 +14264,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14313,7 +14313,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14362,7 +14362,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14411,7 +14411,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18328,7 +18328,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18377,7 +18377,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18519,7 +18519,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -18573,7 +18573,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19641,7 +19641,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19691,7 +19691,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19840,7 +19840,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -19895,7 +19895,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23147,7 +23147,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23187,7 +23187,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23230,7 +23230,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23274,7 +23274,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23318,7 +23318,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23362,7 +23362,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23409,7 +23409,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23454,7 +23454,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23502,7 +23502,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23551,7 +23551,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23600,7 +23600,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -23649,7 +23649,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27664,7 +27664,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27713,7 +27713,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27857,7 +27857,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -27911,7 +27911,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -28989,7 +28989,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -29039,7 +29039,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -29190,7 +29190,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -29245,7 +29245,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32541,7 +32541,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32581,7 +32581,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32624,7 +32624,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32668,7 +32668,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32712,7 +32712,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32756,7 +32756,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32803,7 +32803,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32848,7 +32848,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32896,7 +32896,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32945,7 +32945,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -32994,7 +32994,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -33043,7 +33043,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39363,7 +39363,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39403,7 +39403,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39444,7 +39444,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39488,7 +39488,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39532,7 +39532,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39577,7 +39577,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39666,7 +39666,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39711,7 +39711,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39757,7 +39757,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39806,7 +39806,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39855,7 +39855,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -39905,7 +39905,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -43827,7 +43827,7 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k1
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 2276e5634537..78c44e4dca3b 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -30,13 +30,13 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
; AVX512F-LABEL: v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -943,12 +943,12 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512F-NEXT: vpmovsxwd %xmm2, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 7d0381837b70..8fdacb7b79d6 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -26,9 +26,9 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX512F-LABEL: v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -640,9 +640,9 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
index e92237f524f5..c2da74a1646c 100644
--- a/test/CodeGen/X86/combine-and.ll
+++ b/test/CodeGen/X86/combine-and.ll
@@ -220,6 +220,16 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
ret <4 x i32> %2
}
+define <8 x i16> @and_or_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: and_or_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [15,7,3,1,14,10,2,32767]
+; CHECK-NEXT: retq
+ %1 = or <8 x i16> %a0, <i16 255, i16 127, i16 63, i16 31, i16 15, i16 31, i16 63, i16 -1>
+ %2 = and <8 x i16> %1, <i16 15, i16 7, i16 3, i16 1, i16 14, i16 10, i16 2, i16 32767>
+ ret <8 x i16> %2
+}
+
;
; known bits folding
;
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index 1601c67dce25..65184c4d278e 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -430,25 +430,35 @@ define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
ret <4 x i32> %or
}
-; TODO: Why would we do this?
-; (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
; CHECK-LABEL: or_and_v2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%1 = and <2 x i64> %a0, <i64 7, i64 7>
%2 = or <2 x i64> %1, <i64 3, i64 3>
ret <2 x i64> %2
}
-; If all masked bits are going to be set, that's a constant fold.
-
define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
; CHECK-LABEL: or_and_v4i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+ %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
+ %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
+ ret <4 x i32> %2
+}
+
+; If all masked bits are going to be set, that's a constant fold.
+
+define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
+; CHECK-LABEL: or_and_v4i32_fold:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/X86/darwin-bzero.ll b/test/CodeGen/X86/darwin-bzero.ll
index 3099526028ab..410d67ff0ec1 100644
--- a/test/CodeGen/X86/darwin-bzero.ll
+++ b/test/CodeGen/X86/darwin-bzero.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep __bzero
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+; CHECK-LABEL: foo:
+; CHECK: {{calll|callq}} ___bzero
define void @foo(i8* %p, i32 %len) {
call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 4d24a15fe2e1..66bdfb8475f1 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -403,16 +403,14 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: movb -24(%rsp,%rdi), %al
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i8_var:
; AVX: # %bb.0:
; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: movb -24(%rsp,%rdi), %al
; AVX-NEXT: retq
%b = extractelement <16 x i8> %a, i256 %i
ret i8 %b
@@ -428,8 +426,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-NEXT: andl $31, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movq %rsp, %rax
-; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: movb (%rsp,%rdi), %al
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
@@ -442,8 +439,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: andl $31, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
-; AVX-NEXT: movq %rsp, %rax
-; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: movb (%rsp,%rdi), %al
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 8dacf2dcf971..a0e919d128df 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -140,21 +140,23 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x doubl
define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
; SKX-LABEL: test11:
; SKX: # %bb.0: # %entry
-; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0
+; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm2
; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; KNL-LABEL: test11:
; KNL: # %bb.0: # %entry
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0,-0,-0,-0]
-; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; KNL-NEXT: vxorps %xmm3, %xmm2, %xmm2
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vmovaps %xmm2, %xmm0
; KNL-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
- %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
ret <4 x float> %0
}
@@ -164,19 +166,17 @@ define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 z
; SKX-LABEL: test11b:
; SKX: # %bb.0: # %entry
; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test11b:
; KNL: # %bb.0: # %entry
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
-; KNL-NEXT: vmovaps %xmm1, %xmm0
+; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
- %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
ret <4 x float> %0
}
@@ -305,3 +305,147 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test18:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test18:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test19:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test19:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test20:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vmovaps %xmm2, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test20:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vmovaps %xmm2, %xmm0
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test21:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test21:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test22:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test22:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test23(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test23:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vmovaps %xmm2, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test23:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vmovaps %xmm2, %xmm0
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <4 x float> @test24(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test24:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test24:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 8) #10
+ ret <4 x float> %0
+}
+
+define <16 x float> @test25(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
+; CHECK-LABEL: test25:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %sub.i.2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %sub.i, <16 x float> %sub.i.2, i16 -1, i32 8) #2
+ ret <16 x float> %0
+}
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
index 814d61e22382..ca2c61a88507 100644
--- a/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -8,26 +8,17 @@
define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd128:
; FMA3_256: # %bb.0: # %entry
-; FMA3_256-NEXT: vmulpd %xmm1, %xmm0, %xmm0
-; FMA3_256-NEXT: vsubpd %xmm2, %xmm0, %xmm1
-; FMA3_256-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; FMA3_256-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_256-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd128:
; FMA3_512: # %bb.0: # %entry
-; FMA3_512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
-; FMA3_512-NEXT: vsubpd %xmm2, %xmm0, %xmm1
-; FMA3_512-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; FMA3_512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_512-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd128:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0
-; FMA4-NEXT: vsubpd %xmm2, %xmm0, %xmm1
-; FMA4-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; FMA4-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%AB = fmul <2 x double> %A, %B
@@ -40,18 +31,12 @@ entry:
define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps128:
; FMA3: # %bb.0: # %entry
-; FMA3-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; FMA3-NEXT: vsubps %xmm2, %xmm0, %xmm1
-; FMA3-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; FMA3-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA3-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps128:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT: vsubps %xmm2, %xmm0, %xmm1
-; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; FMA4-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%AB = fmul <4 x float> %A, %B
@@ -64,18 +49,12 @@ entry:
define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_subadd_pd256:
; FMA3: # %bb.0: # %entry
-; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0
-; FMA3-NEXT: vsubpd %ymm2, %ymm0, %ymm1
-; FMA3-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; FMA3-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA3-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd256:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0
-; FMA4-NEXT: vsubpd %ymm2, %ymm0, %ymm1
-; FMA4-NEXT: vaddpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
entry:
%AB = fmul <4 x double> %A, %B
@@ -88,18 +67,12 @@ entry:
define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps256:
; FMA3: # %bb.0: # %entry
-; FMA3-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; FMA3-NEXT: vsubps %ymm2, %ymm0, %ymm1
-; FMA3-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; FMA3-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA3-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps256:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; FMA4-NEXT: vsubps %ymm2, %ymm0, %ymm1
-; FMA4-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
entry:
%AB = fmul <8 x float> %A, %B
@@ -112,34 +85,19 @@ entry:
define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd512:
; FMA3_256: # %bb.0: # %entry
-; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1
-; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
-; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3
-; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1
-; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
-; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0
-; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA3_256-NEXT: vfmsubadd213pd %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmsubadd213pd %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
; FMA3_512: # %bb.0: # %entry
-; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
-; FMA3_512-NEXT: vsubpd %zmm2, %zmm0, %zmm1
-; FMA3_512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
-; FMA3_512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
+; FMA3_512-NEXT: vfmsubadd213pd %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd512:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
-; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
-; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3
-; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1
-; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
-; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA4-NEXT: vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
entry:
%AB = fmul <8 x double> %A, %B
@@ -152,35 +110,19 @@ entry:
define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_subadd_ps512:
; FMA3_256: # %bb.0: # %entry
-; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
-; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3
-; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1
-; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA3_256-NEXT: vfmsubadd213ps %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmsubadd213ps %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
; FMA3_512: # %bb.0: # %entry
-; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm1
-; FMA3_512-NEXT: vaddps %zmm2, %zmm1, %zmm0
-; FMA3_512-NEXT: movw $-21846, %ax # imm = 0xAAAA
-; FMA3_512-NEXT: kmovw %eax, %k1
-; FMA3_512-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1}
+; FMA3_512-NEXT: vfmsubadd213ps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps512:
; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
-; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3
-; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1
-; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA4-NEXT: vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
entry:
%AB = fmul <16 x float> %A, %B
diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll
index 481f55e9e10d..db73195698e3 100644
--- a/test/CodeGen/X86/fold-vector-sext-crash.ll
+++ b/test/CodeGen/X86/fold-vector-sext-crash.ll
@@ -9,9 +9,9 @@
define <4 x i64> @foo(<4 x i64> %A) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vmovdqa %xmm1, %xmm1
-; CHECK-NEXT: vandps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retl
%1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0>
ret <4 x i64> %1
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
index a54e01d9af67..fa92158ae92d 100644
--- a/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -309,30 +309,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -371,30 +366,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -906,16 +896,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -924,14 +911,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -940,15 +925,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -994,16 +977,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1012,14 +992,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1028,15 +1006,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1045,15 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1743,16 +1717,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1764,14 +1735,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1781,15 +1750,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1847,16 +1814,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1868,14 +1832,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1885,15 +1847,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1902,17 +1862,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll
index f03e745598e6..fa5828a45700 100644
--- a/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -311,30 +311,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -373,30 +368,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -910,16 +900,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -928,14 +915,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -944,15 +929,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -998,16 +981,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1016,14 +996,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1032,15 +1010,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1049,15 +1025,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1745,16 +1719,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1766,14 +1737,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1783,15 +1752,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1849,16 +1816,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1870,14 +1834,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1887,15 +1849,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1904,17 +1864,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
index 52e623b82718..204479976e90 100644
--- a/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -362,30 +362,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -408,30 +403,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -1031,16 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1049,14 +1036,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1065,15 +1050,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1099,16 +1082,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1117,14 +1097,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1133,15 +1111,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1150,15 +1126,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1992,16 +1966,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
-; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: psrlw $8, %xmm2
+; X86-SSE42-NEXT: pminub %xmm0, %xmm2
+; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -2013,14 +1984,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -2030,15 +1999,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -2068,16 +2035,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
-; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: psrlw $8, %xmm2
+; X64-SSE42-NEXT: pminub %xmm0, %xmm2
+; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -2089,14 +2053,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -2106,15 +2068,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -2123,17 +2083,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
index 505663656a3a..2a37d17365be 100644
--- a/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -352,30 +352,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -398,30 +387,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -1004,16 +982,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1022,14 +994,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1038,15 +1005,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1072,16 +1034,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1090,14 +1046,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1106,15 +1057,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1123,15 +1069,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1942,16 +1883,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pminub %xmm3, %xmm1
; X86-SSE42-NEXT: pminub %xmm2, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
-; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT: psrlw $8, %xmm0
-; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1963,14 +1898,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1980,15 +1910,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -2018,16 +1943,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pminub %xmm3, %xmm1
; X64-SSE42-NEXT: pminub %xmm2, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
-; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT: psrlw $8, %xmm0
-; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -2039,14 +1958,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -2056,15 +1970,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -2073,17 +1982,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll
index 283d1f93dfb6..46a888f3b9b6 100644
--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@@ -160,17 +160,19 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
; X32: # %bb.0:
-; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vorps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
diff --git a/test/CodeGen/X86/machinesink-merge-debuginfo.ll b/test/CodeGen/X86/machinesink-merge-debuginfo.ll
index d8fcea1872e8..f5023bbeb5f9 100644
--- a/test/CodeGen/X86/machinesink-merge-debuginfo.ll
+++ b/test/CodeGen/X86/machinesink-merge-debuginfo.ll
@@ -5,6 +5,21 @@ source_filename = "test-sink-debug.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+; double foo(double x, double y, bool c) {
+; double a = x / 3.0;
+; double b = y / 5.0;
+; double ret;
+;
+; if (c)
+; ret = a + 1.0;
+; else
+; ret = b + 1.0;
+;
+; ret = ret + 1.0;
+;
+; return ret;
+; }
+
; Function Attrs: nounwind readnone uwtable
define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !7 {
tail call void @llvm.dbg.value(metadata double %x, metadata !13, metadata !DIExpression()), !dbg !16
@@ -17,9 +32,10 @@ first:
%e = fadd double %a, 1.000000e+00
br label %final
second:
- %f = fadd double %b, 1.000000e+00, !dbg !17
-; CHECK: debug-location !17
-; CHECK: debug-location !17
+; CHECK-NOT: debug-location !17
+; CHECK: debug-location !18
+; CHECK-NOT: debug-location !17
+ %f = fadd double %b, 1.000000e+00, !dbg !18
br label %final
final:
%cond = phi double [%e, %first], [%f, %second]
@@ -27,15 +43,39 @@ final:
ret double %d
}
-; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-attributes #1 = { nounwind readnone speculatable }
+
+; Function Attrs: nounwind readnone uwtable
+define double @_Z4foo1ddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !19 {
+ tail call void @llvm.dbg.value(metadata double %x, metadata !21, metadata !DIExpression()), !dbg !24
+ tail call void @llvm.dbg.value(metadata double %y, metadata !22, metadata !DIExpression()), !dbg !24
+ tail call void @llvm.dbg.value(metadata i1 %c, metadata !23, metadata !DIExpression()), !dbg !24
+ %a = fdiv double %x, 3.000000e+00
+ %b = fdiv double %y, 5.000000e+00, !dbg !25
+ br i1 %c, label %first, label %second
+first:
+ %e = fadd double %a, 1.000000e+00
+ br label %final
+second:
+ %f = fadd double %b, 1.000000e+00, !dbg !25
+; CHECK: debug-location !25
+; CHECK-NEXT: debug-location !25
+ br label %final
+final:
+ %cond = phi double [%e, %first], [%f, %second]
+ %d = fadd double %cond, 1.000000e+00
+ ret double %d
+}
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
+attributes #1 = { nounwind readnone speculatable }
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 313291)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test-sink-debug.cpp", directory: "/tmp")
!2 = !{}
@@ -54,3 +94,11 @@ attributes #1 = { nounwind readnone speculatable }
!15 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 1, type: !11)
!16 = !DILocation(line: 1, column: 19, scope: !7)
!17 = !DILocation(line: 2, column: 26, scope: !7)
+!18 = !DILocation(line: 3, column: 20, scope: !7)
+!19 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ddb", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!20 = !{!21, !22, !23}
+!21 = !DILocalVariable(name: "x", arg: 1, scope: !19, file: !1, line: 1, type: !10)
+!22 = !DILocalVariable(name: "y", arg: 2, scope: !19, file: !1, line: 1, type: !10)
+!23 = !DILocalVariable(name: "c", arg: 3, scope: !19, file: !1, line: 1, type: !11)
+!24 = !DILocation(line: 1, column: 19, scope: !19)
+!25 = !DILocation(line: 2, column: 26, scope: !19)
diff --git a/test/CodeGen/X86/machinesink-null-debuginfo.ll b/test/CodeGen/X86/machinesink-null-debuginfo.ll
index 454e0cd704ff..c0399b3cfa81 100644
--- a/test/CodeGen/X86/machinesink-null-debuginfo.ll
+++ b/test/CodeGen/X86/machinesink-null-debuginfo.ll
@@ -11,10 +11,10 @@ define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr
tail call void @llvm.dbg.value(metadata double %y, metadata !14, metadata !DIExpression()), !dbg !17
tail call void @llvm.dbg.value(metadata i1 %c, metadata !15, metadata !DIExpression()), !dbg !18
%a = fdiv double %x, 3.000000e+00
- %b = fdiv double %y, 5.000000e+00, !dbg !21
+ %b = fdiv double %y, 5.000000e+00, !dbg !19
%cond = select i1 %c, double %a, double %b
-; CHECK-NOT: debug-location !21
- ret double %cond, !dbg !22
+; CHECK-NOT: debug-location !19
+ ret double %cond, !dbg !20
}
; Function Attrs: nounwind readnone speculatable
@@ -45,5 +45,5 @@ attributes #1 = { nounwind readnone speculatable }
!16 = !DILocation(line: 1, column: 19, scope: !7)
!17 = !DILocation(line: 1, column: 29, scope: !7)
!18 = !DILocation(line: 1, column: 37, scope: !7)
-!21 = !DILocation(line: 2, column: 26, scope: !7)
-!22 = !DILocation(line: 2, column: 3, scope: !7)
+!19 = !DILocation(line: 2, column: 26, scope: !7)
+!20 = !DILocation(line: 2, column: 3, scope: !7)
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 1eb2631e26ef..d318dde34434 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1500,7 +1500,8 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
+; KNL_32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
@@ -1596,7 +1597,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
+; KNL_32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -2606,56 +2608,32 @@ define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <
define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64-LABEL: sext_i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm1
-; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: kxnorw %k0, %k0, %k2
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovsxbw %xmm0, %ymm0
-; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm1
-; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: kxnorw %k0, %k0, %k2
-; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbw %xmm0, %ymm0
-; SKX-NEXT: vpmovsxwq %xmm0, %zmm1
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpmovsxbw %xmm0, %ymm0
-; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm1
-; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: kxnorw %k0, %k0, %k2
-; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <16 x i8> %ind to <16 x i64>
@@ -2669,40 +2647,42 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: sext_v8i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_64-NEXT: vpsllq $56, %zmm0, %zmm0
-; KNL_64-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0
+; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm0
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: sext_v8i8_index:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpsllq $56, %zmm0, %zmm0
-; KNL_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0
+; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm0
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: sext_v8i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX-NEXT: vpsllq $56, %zmm0, %zmm0
-; SKX-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT: vpslld $24, %ymm0, %ymm0
+; SKX-NEXT: vpsrad $24, %ymm0, %ymm1
+; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_v8i8_index:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpsllq $56, %zmm0, %zmm0
-; SKX_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
+; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
%sext_ind = sext <8 x i8> %ind to <8 x i64>
@@ -2764,3 +2744,48 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32>
}
declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
+define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
+; KNL_64-LABEL: zext_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: zext_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
+; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: zext_index:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: zext_index:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
+; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: zext_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; SKX_32-NEXT: retl
+ %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+ %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
+ %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index d7622c8d0cab..f7b9ea9b8b2e 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -78,7 +78,7 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shll $8, %eax
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl %ah, %eax
+; X32-NEXT: movzbl %ah, %eax # NOREX
; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index 17a9ac994a79..839948174a43 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -1,27 +1,101 @@
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s
-; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=SLM
-; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=3DNOW
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=PRFCHW3DNOW
+
+; Rules:
+; 3dnow by itself gets you just the single prefetch instruction with no hints
+; sse provides prefetcht0/1/2/nta
+; supporting prefetchw, but not 3dnow, implicitly provides prefetcht0/1/2/nta regardless of the sse setting, as we need something to fall back to for the non-write hint.
+; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings. This allows locality levels for non-write hints and gives us an instruction for write+T0.
+; the 3dnow prefetch instruction will only get used if no other prefetch instructions are enabled
; rdar://10538297
define void @t(i8* %ptr) nounwind {
+; SSE-LABEL: t:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: prefetcht2 (%eax)
+; SSE-NEXT: prefetcht1 (%eax)
+; SSE-NEXT: prefetcht0 (%eax)
+; SSE-NEXT: prefetchnta (%eax)
+; SSE-NEXT: prefetcht2 (%eax)
+; SSE-NEXT: prefetcht1 (%eax)
+; SSE-NEXT: prefetcht0 (%eax)
+; SSE-NEXT: prefetchnta (%eax)
+; SSE-NEXT: retl
+;
+; PRFCHWSSE-LABEL: t:
+; PRFCHWSSE: # %bb.0: # %entry
+; PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; PRFCHWSSE-NEXT: prefetcht2 (%eax)
+; PRFCHWSSE-NEXT: prefetcht1 (%eax)
+; PRFCHWSSE-NEXT: prefetcht0 (%eax)
+; PRFCHWSSE-NEXT: prefetchnta (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: prefetchw (%eax)
+; PRFCHWSSE-NEXT: retl
+;
+; PREFETCHWT1-LABEL: t:
+; PREFETCHWT1: # %bb.0: # %entry
+; PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; PREFETCHWT1-NEXT: prefetcht2 (%eax)
+; PREFETCHWT1-NEXT: prefetcht1 (%eax)
+; PREFETCHWT1-NEXT: prefetcht0 (%eax)
+; PREFETCHWT1-NEXT: prefetchnta (%eax)
+; PREFETCHWT1-NEXT: prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT: prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT: prefetchw (%eax)
+; PREFETCHWT1-NEXT: prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT: retl
+;
+; 3DNOW-LABEL: t:
+; 3DNOW: # %bb.0: # %entry
+; 3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: prefetch (%eax)
+; 3DNOW-NEXT: retl
+;
+; PRFCHW3DNOW-LABEL: t:
+; PRFCHW3DNOW: # %bb.0: # %entry
+; PRFCHW3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetch (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: prefetchw (%eax)
+; PRFCHW3DNOW-NEXT: retl
entry:
-; CHECK: prefetcht2
-; CHECK: prefetcht1
-; CHECK: prefetcht0
-; CHECK: prefetchnta
-; PRFCHW: prefetchw
-; NOPRFCHW-NOT: prefetchw
-; SLM: prefetchw
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0, i32 1 )
+ tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 1, i32 1 )
+ tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 2, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3, i32 1 )
+ tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 0, i32 1 )
ret void
}
-declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
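(Editor's sketch, not part of the commit above: the prefetch.ll rules comment refers to the operands of the @llvm.prefetch intrinsic without restating them. As a minimal standalone example using the same pre-opaque-pointer IR syntax as the test, the second argument selects read (0) vs. write (1), the third the locality level 0-3 where 3 is the most local (T0), and the fourth selects the data (1) vs. instruction (0) cache. The function name is hypothetical.)

; Minimal sketch: a write prefetch with maximum locality on the data cache.
; Per the rules above this lowers to prefetchw where the target supports it,
; otherwise to one of the fallbacks (prefetcht0, or 3dnow's plain prefetch).
define void @prefetch_write_t0(i8* %ptr) nounwind {
entry:
  tail call void @llvm.prefetch(i8* %ptr, i32 1, i32 3, i32 1)
  ret void
}
declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
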
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
index 0f1f818e250d..2df115e2f9e4 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -363,12 +364,26 @@ define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F: # %bb.0:
@@ -409,12 +424,26 @@ define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F: # %bb.0:
@@ -455,12 +484,26 @@ define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16_3:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F: # %bb.0:
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
index 7cef269ebc2b..081c962ab94a 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -362,18 +363,30 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16_1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # %bb.0:
@@ -446,18 +459,30 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16_2:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # %bb.0:
@@ -522,18 +547,30 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16_3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # %bb.0:
@@ -633,11 +670,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -659,11 +694,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -795,11 +828,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -821,11 +852,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -949,14 +978,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -978,14 +1002,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index 7f3431fabedc..a5d0a7fa401b 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -99,10 +99,9 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
@@ -673,17 +672,14 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
@@ -831,17 +827,14 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
@@ -989,24 +982,14 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
@@ -1154,17 +1137,14 @@ define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 1bfe37b1497e..f2cd81d3d937 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -436,12 +437,26 @@ define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_to_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_to_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
; AVX512F: # %bb.0:
@@ -482,12 +497,26 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
-; AVX-LABEL: trunc_v2i64_to_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: trunc_v2i64_to_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
; AVX512F: # %bb.0:
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 59a8aa47246c..5b46c73a3f65 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -139,59 +140,17 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i16_to_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -290,13 +249,21 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
; AVX512F: # %bb.0:
@@ -399,12 +366,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -490,18 +454,30 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # %bb.0:
@@ -563,14 +539,23 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F: # %bb.0:
@@ -693,14 +678,23 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_v4i64_to_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 3fa148405f6b..4577cd8f8b44 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -148,10 +148,9 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index 82a790298f23..3baab2476d40 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -142,14 +142,13 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; AVX1-NEXT: andl $7, %r10d
; AVX1-NEXT: andl $28, %edi
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: movq %rsp, %rax
-; AVX1-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
-; AVX1-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
@@ -505,118 +504,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-NEXT: vpextrb $0, %xmm2, %eax
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movq %rsp, %rcx
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpextrb $1, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $2, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $3, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $4, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $5, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $6, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $7, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $8, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $9, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $10, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $11, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $12, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $13, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $14, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $15, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpextrb $1, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $2, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $3, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $4, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $5, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $6, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $7, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $8, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $9, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $10, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $11, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $12, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $13, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $14, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX1-NEXT: vpextrb $15, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
@@ -633,118 +631,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX2-NEXT: vpextrb $0, %xmm2, %eax
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movq %rsp, %rcx
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $2, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $3, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $4, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $5, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $6, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $7, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $8, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $9, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $10, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $11, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $12, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $13, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $14, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $15, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpextrb $1, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $2, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $3, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $4, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $5, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $6, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $7, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $8, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $9, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $10, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $11, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $12, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $13, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $14, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX2-NEXT: vpextrb $15, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
@@ -761,118 +758,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movq %rsp, %rcx
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vmovd %eax, %xmm2
; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
@@ -889,118 +885,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movq %rsp, %rcx
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm2
; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2
; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: movq %rbp, %rsp
@@ -1240,7 +1235,6 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
; AVX1-NEXT: andl $7, %r10d
; AVX1-NEXT: andl $28, %edi
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: movq %rsp, %rax
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll
index a5aa73cdf1a2..3f9f96b008c5 100644
--- a/test/CodeGen/X86/var-permute-512.ll
+++ b/test/CodeGen/X86/var-permute-512.ll
@@ -511,265 +511,201 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, (%rsp)
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm4, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 3008(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm0
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $1, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $2, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $3, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $4, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $5, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $6, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $7, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $8, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $9, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $10, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $11, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $12, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $13, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $14, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $15, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $0, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm2, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 4032(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm1
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $1, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $2, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $3, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $4, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $5, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $6, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $7, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $8, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $9, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $10, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $11, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $12, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $13, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $14, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $15, %xmm2, %eax
; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1
; NOBW-NEXT: vpextrb $0, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm2, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 960(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $1, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $2, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $3, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $4, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $5, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $6, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $7, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $8, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $9, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $10, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $11, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $12, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $13, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $14, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $15, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: movq %rsp, %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm2
+; NOBW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2
; NOBW-NEXT: vpextrb $0, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: movzbl (%rax,%rcx), %eax
-; NOBW-NEXT: vpextrb $1, %xmm3, %ecx
-; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: movzbl 1984(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $1, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $2, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $3, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $3, 1792(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $4, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $5, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $6, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $7, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $8, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $9, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $10, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $11, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $12, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $13, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $14, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $15, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm3
+; NOBW-NEXT: vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3
; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
; NOBW-NEXT: movq %rbp, %rsp
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index 12530adf15cb..2178eb70cdec 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -2384,11 +2384,10 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; AVX512F-LABEL: test_cmp_v32f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -2399,12 +2398,11 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
;
; AVX512DQ-LABEL: test_cmp_v32f32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -2893,11 +2891,10 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; AVX512F-LABEL: test_cmp_v32i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -2908,12 +2905,11 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
;
; AVX512DQ-LABEL: test_cmp_v32i32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -5965,13 +5961,12 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vcmpltpd %zmm2, %zmm6, %k0
; AVX512F-NEXT: vcmpltpd %zmm3, %zmm7, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -5984,14 +5979,13 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm4, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm5, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vcmpltpd %zmm2, %zmm6, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm3, %zmm7, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -6588,13 +6582,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
; AVX512F-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -6607,14 +6600,13 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
; AVX512DQ-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
-; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
-; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 44fe38fa86b9..4da23e539f6a 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -2195,8 +2195,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; AVX512VL-NEXT: shlq $32, %rdx
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -2205,108 +2204,30 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
}
define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_4f32_to_8i16_zero:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_4f32_to_8i16_zero:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: movzwl %dx, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_4f32_to_8i16_zero:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2715,8 +2636,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
; AVX512VL-NEXT: shlq $32, %rdx
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512VL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -2727,112 +2647,31 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
}
define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: movzwl %dx, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: vmovdqa %xmm0, (%rdi)
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -3389,8 +3228,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: shlq $32, %rax
; AVX512VL-NEXT: orq %r14, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: addq $40, %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
@@ -3478,84 +3316,43 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX2-NEXT: popq %r14
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $40, %rsp
-; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %r14d
-; AVX512F-NEXT: orl %ebx, %r14d
-; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: orl %ebx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %r14, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: addq $40, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $40, %rsp
-; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %r14d
-; AVX512VL-NEXT: orl %ebx, %r14d
-; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %ebx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %r14, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: addq $40, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_4f64_to_8i16_zero:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -4095,8 +3892,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512VL-NEXT: shlq $32, %rax
; AVX512VL-NEXT: orq %rbx, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
; AVX512VL-NEXT: addq $32, %rsp
; AVX512VL-NEXT: popq %rbx
@@ -4195,92 +3991,47 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movq %rdi, %r14
-; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebp
-; AVX512F-NEXT: shll $16, %ebp
-; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %ebx
-; AVX512F-NEXT: orl %ebp, %ebx
-; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebp
-; AVX512F-NEXT: shll $16, %ebp
-; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: orl %ebp, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %rbx, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
-; AVX512F-NEXT: addq $32, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %rbp
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $32, %rsp
-; AVX512VL-NEXT: movq %rdi, %r14
-; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebp
-; AVX512VL-NEXT: shll $16, %ebp
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %ebx
-; AVX512VL-NEXT: orl %ebp, %ebx
-; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebp
-; AVX512VL-NEXT: shll $16, %ebp
-; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %ebp, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %rbx, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
-; AVX512VL-NEXT: addq $32, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %rbp
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $32, %rsp
+; AVX512-NEXT: movq %rdi, %r14
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %ebx
+; AVX512-NEXT: orl %ebp, %ebx
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebp, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %xmm0, (%r14)
+; AVX512-NEXT: addq $32, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
%3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index b40c9eddd46b..8af96c168be6 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -699,20 +699,35 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: var_rotate_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VL-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VL-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
; XOP: # %bb.0:
@@ -1249,16 +1264,29 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: constant_rotate_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_rotate_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: constant_rotate_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VL-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
;
; XOP-LABEL: constant_rotate_v16i8:
; XOP: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index ea33f22cc07a..ca670f40ab3f 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -531,23 +531,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -948,25 +967,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -1441,21 +1481,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 307cf287219d..890cedf97c9d 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -451,23 +451,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -753,25 +772,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -1148,21 +1188,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index b518ad5fcffd..9481e46c0c52 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -401,23 +401,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -695,25 +714,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
@@ -998,21 +1038,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 2fcbd89b857e..2f5a2b116115 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -3,8 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -58,17 +60,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %shuffle
}
@@ -93,17 +88,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
}
@@ -115,11 +103,27 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
ret <16 x i8> %shuffle
}
@@ -131,11 +135,27 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
ret <16 x i8> %shuffle
}
@@ -1203,12 +1223,25 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX1OR2: # %bb.0: # %entry
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX512VLBW: # %bb.0: # %entry
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX512VLVBMI: # %bb.0: # %entry
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
+; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: retq
entry:
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 60bc36948d23..072d71fae570 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3,8 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_01012323:
@@ -85,11 +87,33 @@ define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_00004444:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_00004444:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_00004444:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_00004444:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_00004444:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_00004444:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
@@ -126,11 +150,33 @@ define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_31206745:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_31206745:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_31206745:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_31206745:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_31206745:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_31206745:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
@@ -179,11 +225,33 @@ define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_23026745:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_23026745:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_23026745:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_23026745:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_23026745:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_23026745:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
@@ -194,11 +262,33 @@ define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_23016747:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_23016747:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_23016747:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_23016747:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_23016747:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_23016747:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
ret <8 x i16> %shuffle
}
@@ -597,11 +687,33 @@ define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_04404567:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_04404567:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_04404567:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_04404567:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_04404567:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_04404567:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
@@ -700,17 +812,10 @@ define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_0127XXXX:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_0127XXXX:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -733,17 +838,10 @@ define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_XXXX4563:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_XXXX4563:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
@@ -766,17 +864,10 @@ define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_4563XXXX:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_4563XXXX:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -799,17 +890,10 @@ define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_01274563:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_01274563:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
@@ -832,17 +916,10 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v8i16_45630127:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_45630127:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
ret <8 x i16> %shuffle
}
@@ -980,12 +1057,38 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_109832ba:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_109832ba:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_109832ba:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_109832ba:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
@@ -1028,13 +1131,43 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_0213cedf:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_0213cedf:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
@@ -1064,12 +1197,38 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_443aXXXX:
-; AVX: # %bb.0:
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_443aXXXX:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_443aXXXX:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_443aXXXX:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1336,13 +1495,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i16_XXX1X579:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
+; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 11f25a2d687d..cbd1b83a4eb2 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -156,15 +158,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
@@ -214,12 +225,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
@@ -241,12 +259,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
@@ -373,11 +398,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -393,11 +434,27 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
}
@@ -413,11 +470,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -433,11 +506,27 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
}
@@ -453,11 +542,27 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
ret <16 x i16> %shuffle
}
@@ -473,11 +578,27 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
ret <16 x i16> %shuffle
}
@@ -789,13 +910,20 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX512VL: # %bb.0:
@@ -850,12 +978,18 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15,12,13,10,11,8,9,22,23,20,21,18,19,16,17,30,31,28,29,26,27,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX512VL: # %bb.0:
@@ -880,13 +1014,20 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
+; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX512VL: # %bb.0:
@@ -1349,12 +1490,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
; AVX512VL: # %bb.0:
@@ -1376,12 +1523,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
; AVX512VL: # %bb.0:
@@ -1404,12 +1557,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
; AVX512VL: # %bb.0:
@@ -1431,12 +1590,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
; AVX512VL: # %bb.0:
@@ -1877,15 +2042,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX512VL: # %bb.0:
@@ -1909,15 +2083,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX512VL: # %bb.0:
@@ -1995,14 +2178,22 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX512VL: # %bb.0:
@@ -2081,14 +2272,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
; AVX512VL: # %bb.0:
@@ -2110,14 +2309,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX512VL: # %bb.0:
@@ -2396,12 +2603,19 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX512VL: # %bb.0:
@@ -2540,14 +2754,22 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX512VL: # %bb.0:
@@ -2655,17 +2877,10 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_u
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -2705,17 +2920,10 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -2803,12 +3011,19 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX512VL: # %bb.0:
@@ -2957,16 +3172,25 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX512VL: # %bb.0:
@@ -2992,16 +3216,26 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX512VL: # %bb.0:
@@ -3123,15 +3357,24 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX512VL: # %bb.0:
@@ -3156,12 +3399,18 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15,24,25,24,25,22,23,20,21,24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
; AVX512VL: # %bb.0:
@@ -3363,14 +3612,22 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX512VL: # %bb.0:
@@ -3783,12 +4040,31 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX512VL-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %shuffle
}
@@ -3821,11 +4097,33 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
}
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
-; ALL: # %bb.0:
-; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -3838,12 +4136,31 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
@@ -3906,9 +4223,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
;
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
-; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25]
+; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
%2 = bitcast <16 x i16> %1 to <4 x i64>
@@ -4006,21 +4322,36 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: PR24935:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: PR24935:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR24935:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,2,3,6,7,10,11,10,11,12,13,14,15,16,17,18,19,18,19,22,23,26,27,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: PR24935:
; AVX512VL: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 01c7fc466eb8..51ef3a18438f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1,7 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-FAST
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -312,17 +316,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX512VLBW-NEXT: movl $32767, %eax # imm = 0x7FFF
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -346,14 +356,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: movl $1, %eax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: movl $1, %eax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -377,14 +393,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: movw $1, %ax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: movw $1, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -408,14 +430,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: movw $1, %ax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: movw $1, %ax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -431,12 +459,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -452,12 +507,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -473,12 +555,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -494,12 +603,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -522,12 +658,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -550,12 +698,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -578,12 +738,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -606,12 +778,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -634,12 +818,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -662,12 +858,24 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -690,12 +898,24 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -722,14 +942,29 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: movl $15, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512VLBW-SLOW-NEXT: movl $15, %eax
+; AVX512VLBW-SLOW-NEXT: vmovd %eax, %xmm1
+; AVX512VLBW-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
+; AVX512VLBW-FAST-NEXT: movl $15, %eax
+; AVX512VLBW-FAST-NEXT: vmovd %eax, %xmm1
+; AVX512VLBW-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: movl $31, %eax
+; AVX512VLVBMI-NEXT: vmovd %eax, %xmm1
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
@@ -1092,25 +1327,49 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX512VLBW-SLOW: # %bb.0:
+; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512VLBW-SLOW-NEXT: retq
+;
+; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX512VLBW-FAST: # %bb.0:
+; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX512VLBW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-FAST-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1
+; AVX512VLBW-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
+; AVX512VLBW-FAST-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,0,32,0,32,0,32,0,32,0,32,0,32,0,32,16,48,16,48,16,48,16,48,16,48,16,48,16,48,16,48]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
ret <32 x i8> %shuffle
}
@@ -1129,12 +1388,26 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,40,41,42,43,44,45,46,47,16,16,16,16,16,16,16,16,56,57,58,59,60,61,62,63]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -1155,11 +1428,24 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,47,46,45,44,43,42,41,40,23,22,21,20,19,18,17,16,63,62,61,60,59,58,57,56]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
ret <32 x i8> %shuffle
}
@@ -1177,12 +1463,26 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,39,38,37,36,35,34,33,32,23,22,21,20,19,18,17,16,55,54,53,52,51,50,49,48]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
ret <32 x i8> %shuffle
}
@@ -1350,13 +1650,19 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
-; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
+; AVX512VLBW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle
}
@@ -1379,13 +1685,19 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
-; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
+; AVX512VLBW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47,16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle
}
@@ -1634,22 +1946,29 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
-; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
-; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX512VLBW-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
+; AVX512VLBW-NEXT: movl $134948620, %eax # imm = 0x80B270C
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [10,13,44,45,3,3,28,8,49,54,61,12,1,44,16,19,52,51,20,51,17,22,5,0,16,10,27,39,4,2,4,7]
+; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
+; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
ret <32 x i8> %shuffle
}
@@ -1663,11 +1982,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
@@ -1682,11 +2013,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
@@ -1702,11 +2045,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
ret <32 x i8> %shuffle
}
@@ -1721,11 +2076,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
ret <32 x i8> %shuffle
}
@@ -1738,12 +2105,25 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
ret <32 x i8> %shuffle
}
@@ -1961,18 +2341,26 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: movl $286331153, %eax # imm = 0x11111111
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
-; AVX512VL-NEXT: retq
+; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VLBW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VLBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: movl $286331153, %eax # imm = 0x11111111
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [56,1,2,3,57,5,6,7,58,9,10,11,59,13,14,15,60,17,18,19,61,21,22,23,62,25,26,27,63,29,30,31]
+; AVX512VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI-NEXT: vpermt2b %ymm0, %ymm2, %ymm1
+; AVX512VLVBMI-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 56, i32 1, i32 2, i32 3, i32 57, i32 5, i32 6, i32 7, i32 58, i32 9, i32 10, i32 11, i32 59, i32 13, i32 14, i32 15, i32 60, i32 17, i32 18, i32 19, i32 61, i32 21, i32 22, i32 23, i32 62, i32 25, i32 26, i32 27, i32 63, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -2159,19 +2547,11 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
ret <32 x i8> %shuffle
}
@@ -2203,37 +2583,21 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
-; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL: # %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
@@ -2245,19 +2609,11 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
@@ -2276,13 +2632,27 @@ define <32 x i8> @shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2OR512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT: retq
%1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%3 = bitcast <16 x i16> %1 to <32 x i8>
@@ -2310,11 +2680,29 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: PR28136:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: PR28136:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: PR28136:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-SLOW-LABEL: PR28136:
+; AVX512VLVBMI-SLOW: # %bb.0:
+; AVX512VLVBMI-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLVBMI-SLOW-NEXT: retq
+;
+; AVX512VLVBMI-FAST-LABEL: PR28136:
+; AVX512VLVBMI-FAST: # %bb.0:
+; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55]
+; AVX512VLVBMI-FAST-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-FAST-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
%2 = bitcast <32 x i8> %1 to <4 x i64>
%3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 41dcb5032ee2..ebef762787d9 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0000:
@@ -546,19 +548,29 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4f64_0z3z:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0z3z:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4f64_0z3z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
ret <4 x double> %shuffle
}
@@ -574,19 +586,29 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4f64_1z2z:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_1z2z:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4f64_1z2z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
%1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
ret <4 x double> %1
}
@@ -812,11 +834,17 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_0124:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_0124:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i64> %shuffle
}
@@ -863,12 +891,19 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_0412:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_0412:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i64> %shuffle
}
@@ -889,11 +924,17 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_4012:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_4012:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i64> %shuffle
}
@@ -924,9 +965,8 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_0451:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,5,1]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
ret <4 x i64> %shuffle
@@ -958,9 +998,8 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_4015:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,5]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i64> %shuffle
@@ -980,11 +1019,17 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_2u35:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5]
+; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
ret <4 x i64> %shuffle
}
@@ -1008,9 +1053,8 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_1251:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,5,1]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
ret <4 x i64> %shuffle
@@ -1121,9 +1165,8 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_0415:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,5]
+; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x i64> %shuffle
@@ -1564,19 +1607,29 @@ define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4i64_z0z3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_z0z3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i64_z0z3:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
+; AVX512VL-FAST-NEXT: retq
%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
ret <4 x i64> %1
}
@@ -1592,19 +1645,29 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v4i64_1z2z:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_1z2z:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
-; AVX512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i64_1z2z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
ret <4 x i64> %1
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 44d0217f5295..1b6df7dd2d25 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000000:
@@ -342,12 +344,26 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_09ab1def:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8f32_09ab1def:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_09ab1def:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_09ab1def:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7]
+; AVX512VL-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x float> %shuffle
}
@@ -651,14 +667,23 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8f32_c348cda0:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_c348cda0:
; AVX512VL: # %bb.0:
@@ -681,14 +706,23 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8f32_f511235a:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_f511235a:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,2,3,7,6,3,2]
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [5,5,1,1,2,3,5,5]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_f511235a:
; AVX512VL: # %bb.0:
@@ -722,11 +756,29 @@ define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_76547654:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_76547654:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_76547654:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
@@ -738,11 +790,29 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_76543210:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8f32_76543210:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8f32_76543210:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -798,11 +868,23 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: PR21138:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: PR21138:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: PR21138:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: PR21138:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
+; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x float> %shuffle
}
@@ -1264,12 +1346,26 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_09ab1def:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_09ab1def:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_09ab1def:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_09ab1def:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7]
+; AVX512VL-FAST-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i32> %shuffle
}
@@ -1696,13 +1792,21 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_6caa87e5:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [4,4,2,2,0,0,6,6]
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_6caa87e5:
; AVX512VL: # %bb.0:
@@ -1737,11 +1841,29 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76547654:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_76547654:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_76547654:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@@ -1753,11 +1875,29 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76543210:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i32_76543210:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i32_76543210:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 3e49957bf85e..d4fb0fd52a79 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -186,8 +186,7 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; SKX: ## %bb.0:
-; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
-; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,2,3,0,1,0,1,10,11,10,11,8,9,8,9,18,19,18,19,16,17,16,17,26,27,26,27,24,25,24,25,34,35,34,35,32,33,32,33,42,43,42,43,40,41,40,41,50,51,50,51,48,49,48,49,58,59,58,59,56,57,56,57]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
ret <32 x i16> %c
@@ -354,3 +353,18 @@ define <8 x i16> @pr32967(<32 x i16> %v) {
%shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
ret <8 x i16> %shuffle
}
+
+define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) {
+; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
+; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[14,15],zero,zero,zmm0[10,11],zero,zero,zmm0[6,7],zero,zero,zmm0[2,3],zero,zero,zmm0[30,31],zero,zero,zmm0[26,27],zero,zero,zmm0[22,23],zero,zero,zmm0[18,19],zero,zero,zmm0[46,47],zero,zero,zmm0[42,43],zero,zero,zmm0[38,39],zero,zero,zmm0[34,35],zero,zero,zmm0[62,63],zero,zero,zmm0[58,59],zero,zero,zmm0[54,55],zero,zero,zmm0[52,53],zero,zero
+; SKX-NEXT: retq
+ %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 39, i32 0, i32 37, i32 0, i32 35, i32 0, i32 33, i32 0, i32 47, i32 0, i32 45, i32 0, i32 43, i32 0, i32 41, i32 0, i32 55, i32 0, i32 53, i32 0, i32 51, i32 0, i32 49, i32 0, i32 63, i32 0, i32 61, i32 0, i32 59, i32 0, i32 58, i32 0>
+ ret <32 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 9c92ca756ebd..f3433ce834cd 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -48,9 +48,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: movb $1, %al
-; AVX512VL-NEXT: kmovw %eax, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1} {z}
+; AVX512VL-NEXT: movq $-1, %rax
+; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
@@ -61,9 +60,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
-; VL_BW_DQ-NEXT: movb $1, %al
-; VL_BW_DQ-NEXT: kmovd %eax, %k1
-; VL_BW_DQ-NEXT: vpmovm2q %k1, %xmm0
+; VL_BW_DQ-NEXT: movq $-1, %rax
+; VL_BW_DQ-NEXT: vmovq %rax, %xmm0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
@@ -123,12 +121,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
-; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -137,10 +135,10 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -250,12 +248,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm0, %zmm0
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -264,10 +262,10 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
@@ -294,12 +292,14 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -308,11 +308,12 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -339,10 +340,11 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -351,9 +353,9 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -382,12 +384,13 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -396,11 +399,11 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -429,12 +432,13 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -443,11 +447,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; VL_BW_DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -462,12 +466,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: movb $51, %al
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
@@ -478,14 +480,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: kmovw %edi, %k1
-; AVX512VL-NEXT: movb $51, %al
-; AVX512VL-NEXT: kmovw %eax, %k2
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512VL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -494,11 +494,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
-; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
@@ -528,15 +527,15 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -546,11 +545,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 0367737dda60..4de24d5fec4d 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -414,63 +414,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm15
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm10
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm7
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm11
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax
; SSE2-NEXT: movd %eax, %xmm12
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%r10), %eax
+; SSE2-NEXT: andl $15, %edx
+; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax
; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: andl $15, %r9d
-; SSE2-NEXT: movzbl (%r9,%r10), %eax
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm13
-; SSE2-NEXT: andl $15, %r8d
-; SSE2-NEXT: movzbl (%r8,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: andl $15, %ecx
-; SSE2-NEXT: movzbl (%rcx,%r10), %eax
+; SSE2-NEXT: andl $15, %edi
+; SSE2-NEXT: movzbl -24(%rsp,%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: andl $15, %r9d
+; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax
; SSE2-NEXT: movd %eax, %xmm14
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%r10), %eax
+; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: andl $15, %esi
-; SSE2-NEXT: movzbl (%rsi,%r10), %eax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: andl $15, %edi
-; SSE2-NEXT: movzbl (%rdi,%r10), %eax
-; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
@@ -479,12 +478,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE2-NEXT: retq
;
@@ -499,63 +498,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm8
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm15
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm9
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm7
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm11
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm6
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax
; SSSE3-NEXT: movd %eax, %xmm12
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%r10), %eax
+; SSSE3-NEXT: andl $15, %edx
+; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax
; SSSE3-NEXT: movd %eax, %xmm5
-; SSSE3-NEXT: andl $15, %r9d
-; SSSE3-NEXT: movzbl (%r9,%r10), %eax
+; SSSE3-NEXT: andl $15, %esi
+; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm13
-; SSSE3-NEXT: andl $15, %r8d
-; SSSE3-NEXT: movzbl (%r8,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%r10), %eax
+; SSSE3-NEXT: andl $15, %edi
+; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: andl $15, %r9d
+; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%r10), %eax
+; SSSE3-NEXT: andl $15, %r8d
+; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: andl $15, %esi
-; SSSE3-NEXT: movzbl (%rsi,%r10), %eax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: andl $15, %edi
-; SSSE3-NEXT: movzbl (%rdi,%r10), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
@@ -564,12 +562,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: retq
;
@@ -583,49 +581,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $15, %edi
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: movzbl (%rdi,%rax), %edi
-; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: movzbl -24(%rsp,%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: andl $15, %esi
-; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0
+; SSE41-NEXT: pinsrb $1, -24(%rsp,%rsi), %xmm0
; SSE41-NEXT: andl $15, %edx
-; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0
+; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0
; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0
+; SSE41-NEXT: pinsrb $3, -24(%rsp,%rcx), %xmm0
; SSE41-NEXT: andl $15, %r8d
-; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0
+; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0
; SSE41-NEXT: andl $15, %r9d
-; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $6, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $7, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $8, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $9, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $10, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $11, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $12, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $13, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $14, (%rcx,%rax), %xmm0
-; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: pinsrb $15, (%rcx,%rax), %xmm0
+; SSE41-NEXT: pinsrb $5, -24(%rsp,%r9), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $6, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $7, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $8, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $9, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $10, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $11, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $12, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $13, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -638,49 +635,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movzbl (%rdi,%rax), %edi
-; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: movzbl -24(%rsp,%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: andl $15, %esi
-; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0
; AVX-NEXT: andl $15, %edx
-; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r8d
-; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
; AVX-NEXT: andl $15, %r9d
-; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $6, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $7, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $8, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $9, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $10, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $11, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $12, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $13, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $14, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: vpinsrb $15, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <16 x i8> %x, i8 %i0
%x1 = extractelement <16 x i8> %x, i8 %i1
@@ -819,69 +815,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movzbl 15(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm8
-; SSE2-NEXT: movzbl 14(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm15
-; SSE2-NEXT: movzbl 13(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm9
-; SSE2-NEXT: movzbl 12(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm3
-; SSE2-NEXT: movzbl 11(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm10
-; SSE2-NEXT: movzbl 10(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm7
-; SSE2-NEXT: movzbl 9(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm11
-; SSE2-NEXT: movzbl 8(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm6
-; SSE2-NEXT: movzbl 7(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm12
-; SSE2-NEXT: movzbl 6(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm5
-; SSE2-NEXT: movzbl 5(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm13
-; SSE2-NEXT: movzbl 4(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm4
-; SSE2-NEXT: movzbl 3(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm14
-; SSE2-NEXT: movzbl 2(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: movzbl 1(%rdi), %edx
-; SSE2-NEXT: andl $15, %edx
-; SSE2-NEXT: movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: movzbl 15(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm8
+; SSE2-NEXT: movzbl 14(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm15
+; SSE2-NEXT: movzbl 13(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm9
+; SSE2-NEXT: movzbl 12(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: movzbl 11(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm10
+; SSE2-NEXT: movzbl 10(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm7
+; SSE2-NEXT: movzbl 9(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm11
+; SSE2-NEXT: movzbl 8(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm6
+; SSE2-NEXT: movzbl 7(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm12
+; SSE2-NEXT: movzbl 6(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm5
+; SSE2-NEXT: movzbl 5(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm13
+; SSE2-NEXT: movzbl 4(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm4
+; SSE2-NEXT: movzbl 3(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm14
+; SSE2-NEXT: movzbl 2(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movzbl 1(%rdi), %ecx
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -904,69 +899,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSSE3: # %bb.0:
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl 15(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm8
-; SSSE3-NEXT: movzbl 14(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm15
-; SSSE3-NEXT: movzbl 13(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm9
-; SSSE3-NEXT: movzbl 12(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: movzbl 11(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm10
-; SSSE3-NEXT: movzbl 10(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm7
-; SSSE3-NEXT: movzbl 9(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm11
-; SSSE3-NEXT: movzbl 8(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm6
-; SSSE3-NEXT: movzbl 7(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm12
-; SSSE3-NEXT: movzbl 6(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm5
-; SSSE3-NEXT: movzbl 5(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm13
-; SSSE3-NEXT: movzbl 4(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm4
-; SSSE3-NEXT: movzbl 3(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm14
-; SSSE3-NEXT: movzbl 2(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: movzbl 1(%rdi), %edx
-; SSSE3-NEXT: andl $15, %edx
-; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT: movd %edx, %xmm2
+; SSSE3-NEXT: movzbl 15(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm8
+; SSSE3-NEXT: movzbl 14(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm15
+; SSSE3-NEXT: movzbl 13(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm9
+; SSSE3-NEXT: movzbl 12(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: movzbl 11(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm10
+; SSSE3-NEXT: movzbl 10(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm7
+; SSSE3-NEXT: movzbl 9(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm11
+; SSSE3-NEXT: movzbl 8(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm6
+; SSSE3-NEXT: movzbl 7(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm12
+; SSSE3-NEXT: movzbl 6(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm5
+; SSSE3-NEXT: movzbl 5(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm13
+; SSSE3-NEXT: movzbl 4(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm4
+; SSSE3-NEXT: movzbl 3(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm14
+; SSSE3-NEXT: movzbl 2(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movzbl 1(%rdi), %ecx
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -990,54 +984,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: andl $15, %eax
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: movzbl (%rax,%rcx), %eax
+; SSE41-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: movzbl 1(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $1, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $1, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 2(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $2, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $2, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 3(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $3, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $3, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 4(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $4, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $4, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 5(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $5, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $5, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 6(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $6, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $6, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 7(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $7, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $7, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 8(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $8, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $8, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 9(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $9, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $9, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 10(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $10, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $10, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 11(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $11, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $11, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 12(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $12, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $12, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 13(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $13, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $13, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 14(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $14, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: movzbl 15(%rdi), %eax
; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: pinsrb $15, (%rax,%rcx), %xmm0
+; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -1045,54 +1038,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: andl $15, %eax
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movzbl (%rax,%rcx), %eax
+; AVX-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: movzbl 1(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 2(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 3(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 4(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 5(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 6(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 7(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 8(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 9(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 10(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 11(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 12(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 13(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 14(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: movzbl 15(%rdi), %eax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT: retq
%p0 = getelementptr inbounds i8, i8* %i, i64 0
%p1 = getelementptr inbounds i8, i8* %i, i64 1
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index d25117ca715c..fd4c30fb327b 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -1504,12 +1504,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1531,12 +1528,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -1647,43 +1641,13 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512F-LABEL: trunc2x4i32_8i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x4i32_8i16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x4i32_8i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x4i32_8i16:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc2x4i32_8i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 94eadd8c1aaf..3ea65573fc70 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -3,7 +3,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -1911,11 +1912,28 @@ define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: retq
+; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
%Z = bitcast <8 x i16> %B to <4 x i32>
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 985f6a861b93..3d4355e5f39c 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -182,7 +182,7 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test11:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
@@ -190,12 +190,12 @@ define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
;
; SSE41-LABEL: test11:
; SSE41: # %bb.0:
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: test11:
; AVX: # %bb.0:
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s b/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s
index 9158c60fbeb5..fb095e3ebc25 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s
@@ -2,9 +2,10 @@
# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
# Test object to verify that dwarfdump handles dwp files with DWARF v5 string
-# offset tables. We have 2 CUs and 2 TUs, where it is assumed that
+# offset tables. We have 3 CUs and 2 TUs, where it is assumed that
# CU1 and TU1 came from one object file, CU2 and TU2 from a second object
-# file.
+# file, and CU3 from a third object file that was compiled with
+# -gdwarf-4.
#
.section .debug_str.dwo,"MS",@progbits,1
str_producer:
@@ -25,56 +26,50 @@ str_TU2:
.asciz "Type_Unit_2"
str_TU2_type:
.asciz "MyStruct_2"
+str_CU3:
+ .asciz "Compile_Unit_3"
+str_CU3_dir:
+ .asciz "/home/test/CU3"
.section .debug_str_offsets.dwo,"",@progbits
# Object files 1's portion of the .debug_str_offsets.dwo section.
-.debug_str_offsets_object_file1:
-
-# CU1's contribution (from object file 1)
-.debug_str_offsets_start_CU1:
- .long .debug_str_offsets_end_CU1-.debug_str_offsets_base_CU1
+# CU1 and TU1 share a contribution to the string offsets table.
+.debug_str_offsets_object_file1_start:
+ .long .debug_str_offsets_object_file1_end-.debug_str_offsets_base_1
.short 5 # DWARF version
.short 0 # Padding
-.debug_str_offsets_base_CU1:
+.debug_str_offsets_base_1:
.long str_producer-.debug_str.dwo
.long str_CU1-.debug_str.dwo
.long str_CU1_dir-.debug_str.dwo
-.debug_str_offsets_end_CU1:
-
-# TU1's contribution (from object file 1)
-.debug_str_offsets_start_TU1:
- .long .debug_str_offsets_end_TU1-.debug_str_offsets_base_TU1
- .short 5 # DWARF version
- .short 0 # Padding
-.debug_str_offsets_base_TU1:
.long str_TU1-.debug_str.dwo
.long str_TU1_type-.debug_str.dwo
-.debug_str_offsets_end_TU1:
+.debug_str_offsets_object_file1_end:
# Object files 2's portion of the .debug_str_offsets.dwo section.
-.debug_str_offsets_object_file2:
-
-# CU2's contribution (from object file 2)
-.debug_str_offsets_start_CU2:
- .long .debug_str_offsets_end_CU2-.debug_str_offsets_base_CU2
+# CU2 and TU2 share a contribution to the string offsets table.
+.debug_str_offsets_object_file2_start:
+ .long .debug_str_offsets_object_file2_end-.debug_str_offsets_base_2
.short 5 # DWARF version
.short 0 # Padding
-.debug_str_offsets_base_CU2:
+.debug_str_offsets_base_2:
.long str_producer-.debug_str.dwo
.long str_CU2-.debug_str.dwo
.long str_CU2_dir-.debug_str.dwo
-.debug_str_offsets_end_CU2:
-
-# TU2's contribution (from object file 2)
-.debug_str_offsets_start_TU2:
- .long .debug_str_offsets_end_TU2-.debug_str_offsets_base_TU2
- .short 5 # DWARF version
- .short 0 # Padding
-.debug_str_offsets_base_TU2:
.long str_TU2-.debug_str.dwo
.long str_TU2_type-.debug_str.dwo
-.debug_str_offsets_end_TU2:
+.debug_str_offsets_object_file2_end:
+# Object files 3's portion of the .debug_str_offsets.dwo section.
+# This file is assumed to have been compiled with -gdwarf-4 and
+# therefore contains a version 4 CU and a GNU format contribution
+# to the .debug_str_offsets section.
+.debug_str_offsets_object_file3_start:
+.debug_str_offsets_base_3:
+ .long str_producer-.debug_str.dwo
+ .long str_CU3-.debug_str.dwo
+ .long str_CU3_dir-.debug_str.dwo
+.debug_str_offsets_object_file3_end:
# Abbrevs are shared for all compile and type units
.section .debug_abbrev.dwo,"",@progbits
@@ -85,8 +80,6 @@ str_TU2_type:
.byte 0x1a # DW_FORM_strx
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
.byte 0x00 # EOM(1)
@@ -96,8 +89,6 @@ str_TU2_type:
.byte 0x01 # DW_CHILDREN_yes
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x03 # Abbrev code
@@ -107,6 +98,17 @@ str_TU2_type:
.byte 0x1a # DW_FORM_strx
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
+ .byte 0x04 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x25 # DW_AT_producer
+ .short 0x3e82 # DW_FORM_GNU_str_index
+ .byte 0x03 # DW_AT_name
+ .short 0x3e82 # DW_FORM_GNU_str_index
+ .byte 0x03 # DW_AT_name
+ .short 0x3e82 # DW_FORM_GNU_str_index
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
.byte 0x00 # EOM(3)
abbrev_end:
@@ -120,15 +122,11 @@ CU1_5_version:
.byte 1 # DWARF Unit Type
.byte 8 # Address Size (in bytes)
.long .debug_abbrev.dwo # Offset Into Abbrev. Section
-# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name,
-# DW_AT_str_offsets and DW_AT_compdir.
+# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name
+# and DW_AT_compdir.
.byte 1 # Abbreviation code
.byte 0 # The index of the producer string
.byte 1 # The index of the CU name string
-# The DW_AT_str_offsets_base attribute for CU1 contains the offset of CU1's
-# contribution relative to the start of object file 1's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_CU1-.debug_str_offsets_object_file1
.byte 2 # The index of the comp dir string
.byte 0 # NULL
CU1_5_end:
@@ -140,19 +138,30 @@ CU2_5_version:
.byte 1 # DWARF Unit Type
.byte 8 # Address Size (in bytes)
.long .debug_abbrev.dwo # Offset Into Abbrev. Section
-# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name,
-# DW_AT_str_offsets and DW_AT_compdir.
+# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name
+# and DW_AT_compdir.
.byte 1 # Abbreviation code
.byte 0 # The index of the producer string
.byte 1 # The index of the CU name string
-# The DW_AT_str_offsets_base attribute for CU2 contains the offset of CU2's
-# contribution relative to the start of object file 2's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_CU2-.debug_str_offsets_object_file2
.byte 2 # The index of the comp dir string
.byte 0 # NULL
CU2_5_end:
+CU3_4_start:
+ .long CU3_4_end-CU3_4_version # Length of Unit
+CU3_4_version:
+ .short 4 # DWARF version number
+ .long .debug_abbrev.dwo # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name
+# and DW_AT_compdir.
+ .byte 4 # Abbreviation code
+ .byte 0 # The index of the producer string
+ .byte 1 # The index of the CU name string
+ .byte 2 # The index of the comp dir string
+ .byte 0 # NULL
+CU3_4_end:
+
.section .debug_types.dwo,"",@progbits
# DWARF v5 Type unit header.
TU1_5_start:
@@ -166,15 +175,11 @@ TU1_5_version:
.long TU1_5_type-TU1_5_start # Type offset
# The type-unit DIE, which has a name.
.byte 2 # Abbreviation code
- .byte 0 # Index of the unit type name string
-# The DW_AT_str_offsets_base attribute for TU1 contains the offset of TU1's
-# contribution relative to the start of object file 1's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_TU1-.debug_str_offsets_object_file1
+ .byte 3 # Index of the unit type name string
# The type DIE, which has a name.
TU1_5_type:
.byte 3 # Abbreviation code
- .byte 1 # Index of the type name string
+ .byte 4 # Index of the type name string
.byte 0 # NULL
.byte 0 # NULL
TU1_5_end:
@@ -190,15 +195,11 @@ TU2_5_version:
.long TU2_5_type-TU2_5_start # Type offset
# The type-unit DIE, which has a name.
.byte 2 # Abbreviation code
- .byte 0 # Index of the unit type name string
-# The DW_AT_str_offsets_base attribute for TU2 contains the offset of TU2's
-# contribution relative to the start of object file 2's portion of the
-# .debug_str_offsets section.
- .long .debug_str_offsets_base_TU2-.debug_str_offsets_object_file2
+ .byte 3 # Index of the unit type name string
# The type DIE, which has a name.
TU2_5_type:
.byte 3 # Abbreviation code
- .byte 1 # Index of the type name string
+ .byte 4 # Index of the type name string
.byte 0 # NULL
.byte 0 # NULL
TU2_5_end:
@@ -207,37 +208,45 @@ TU2_5_end:
# The index header
.long 2 # Version
.long 3 # Columns of contribution matrix
- .long 2 # number of units
- .long 2 # number of hash buckets in table
+ .long 3 # number of units
+ .long 3 # number of hash buckets in table
- # The signatures for both CUs.
+ # The signatures for all CUs.
.quad 0xddeeaaddbbaabbee # signature 1
.quad 0xff00ffeeffaaff00 # signature 2
+ .quad 0xf00df00df00df00d # signature 3
# The indexes for both CUs.
.long 1 # index 1
.long 2 # index 2
- # The sections to which both CUs contribute.
+ .long 3 # index 3
+ # The sections to which all CUs contribute.
.long 1 # DW_SECT_INFO
.long 3 # DW_SECT_ABBREV
.long 6 # DW_SECT_STR_OFFSETS
- # The starting offsets of both CU's contributions to info,
+ # The starting offsets of all CUs' contributions to info,
# abbrev and string offsets table.
.long CU1_5_start-.debug_info.dwo
.long 0
- .long .debug_str_offsets_object_file1-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file1_start-.debug_str_offsets.dwo
.long CU2_5_start-.debug_info.dwo
.long 0
- .long .debug_str_offsets_object_file2-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file2_start-.debug_str_offsets.dwo
+ .long CU3_4_start-.debug_info.dwo
+ .long 0
+ .long .debug_str_offsets_object_file3_start-.debug_str_offsets.dwo
- # The lengths of both CU's contributions to info, abbrev and
+ # The lengths of all CUs' contributions to info, abbrev and
# string offsets table.
.long CU1_5_end-CU1_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_CU1-.debug_str_offsets_start_CU1
+ .long .debug_str_offsets_object_file1_end-.debug_str_offsets_object_file1_start
.long CU2_5_end-CU2_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_CU2-.debug_str_offsets_start_CU2
+ .long .debug_str_offsets_object_file2_end-.debug_str_offsets_object_file2_start
+ .long CU3_4_end-CU3_4_start
+ .long abbrev_end-.debug_abbrev.dwo
+ .long .debug_str_offsets_object_file3_end-.debug_str_offsets_object_file3_start
.section .debug_tu_index,"",@progbits
# The index header
@@ -261,19 +270,19 @@ TU2_5_end:
# abbrev and string offsets table.
.long TU1_5_start-.debug_types.dwo
.long 0
- .long .debug_str_offsets_object_file1-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file1_start-.debug_str_offsets.dwo
.long TU2_5_start-.debug_types.dwo
.long 0
- .long .debug_str_offsets_object_file2-.debug_str_offsets.dwo
+ .long .debug_str_offsets_object_file2_start-.debug_str_offsets.dwo
# The lengths of both TU's contributions to info, abbrev and
# string offsets table.
.long TU1_5_end-TU1_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_TU1-.debug_str_offsets_start_TU1
+ .long .debug_str_offsets_object_file1_end-.debug_str_offsets_object_file1_start
.long TU2_5_end-TU2_5_start
.long abbrev_end-.debug_abbrev.dwo
- .long .debug_str_offsets_end_TU2-.debug_str_offsets_start_TU2
+ .long .debug_str_offsets_object_file2_end-.debug_str_offsets_object_file2_start
# Verify that the correct strings from each unit are displayed and that the
@@ -284,7 +293,6 @@ TU2_5_end:
# CHECK: DW_TAG_compile_unit
# CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1")
# CHECK-NOT: NULL
@@ -293,26 +301,23 @@ TU2_5_end:
# CHECK: DW_TAG_compile_unit
# CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
#
# CHECK: Type Unit
# CHECK-NOT: NULL
# CHECK: DW_TAG_type_unit
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit_1")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "Type_Unit_1")
# CHECK-NOT: NULL
# CHECK: DW_TAG_structure_type
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct_1")
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "MyStruct_1")
#
# CHECK: Type Unit
# CHECK-NOT: NULL
# CHECK: DW_TAG_type_unit
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit_2")
-# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "Type_Unit_2")
# CHECK-NOT: NULL
# CHECK: DW_TAG_structure_type
-# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct_2")
+# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "MyStruct_2")
# Verify the correct offsets of the compile and type units' contributions in the
# index tables.
@@ -322,11 +327,11 @@ TU2_5_end:
# CHECK: 1 0xddeeaaddbbaabbee [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
# CHECK-SAME: [0x00000000
# CHECK-NEXT: 2 0xff00ffeeffaaff00 [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
-# CHECK-SAME: [0x00000024
+# CHECK-SAME: [0x0000001c
# CHECK: .debug_tu_index contents:
# CHECK-NOT: contents:
# CHECK: 1 0xeeaaddbbaabbeedd [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
# CHECK-SAME: [0x00000000
# CHECK-NEXT: 2 0x00ffeeffaaff00ff [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}})
-# CHECK: [0x00000024
+# CHECK: [0x0000001c
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s
index ebde22213728..0ff6fdbb0aa7 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s
@@ -37,6 +37,8 @@ dwo_str_TU_5_type:
.byte 0x01 # Abbrev code
.byte 0x11 # DW_TAG_compile_unit
.byte 0x00 # DW_CHILDREN_no
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x00 # EOM(3)
@@ -54,13 +56,13 @@ CU1_5_version:
.long .debug_abbrev # Offset Into Abbrev. Section
# A compile-unit DIE, which has no attributes.
.byte 1 # Abbreviation code
+ .long .debug_str_offsets_base0
CU1_5_end:
.section .debug_str_offsets,"",@progbits
# CU1's contribution
# Invalid length
.long 0xfffffffe
- .long .debug_str_offsets_segment0_end-.debug_str_offsets_base0
.short 5 # DWARF version
.short 0 # Padding
.debug_str_offsets_base0:
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s
index 9c6094ebade1..36ac7124a9f0 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s
@@ -15,6 +15,8 @@ str_CU1:
.byte 0x01 # Abbrev code
.byte 0x11 # DW_TAG_compile_unit
.byte 0x00 # DW_CHILDREN_no
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x00 # EOM(3)
@@ -32,6 +34,7 @@ CU1_5_version:
.long .debug_abbrev # Offset Into Abbrev. Section
# A compile-unit DIE, which has no attributes.
.byte 1 # Abbreviation code
+ .long .debug_str_offsets_base0
CU1_5_end:
# Every unit contributes to the string_offsets table.
@@ -50,4 +53,4 @@ CU1_5_end:
# INVALIDLENGTH: .debug_str_offsets contents:
# INVALIDLENGTH-NOT: contents:
-# INVALIDLENGTH: error: contribution to string offsets table in section .debug_str_offsets has invalid length.
+# INVALIDLENGTH: error: invalid contribution to string offsets table in section .debug_str_offsets.
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s
new file mode 100644
index 000000000000..28c4a418d125
--- /dev/null
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s
@@ -0,0 +1,94 @@
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck --check-prefix=OVERLAP %s
+#
+# Test object to verify that llvm-dwarfdump handles an invalid string offsets
+# table with overlapping contributions.
+
+ .section .debug_str,"MS",@progbits,1
+str_producer:
+ .asciz "Handmade DWARF producer"
+str_CU1:
+ .asciz "Compile_Unit_1"
+str_CU1_dir:
+ .asciz "/home/test/CU1"
+str_CU2:
+ .asciz "Compile_Unit_2"
+str_CU2_dir:
+ .asciz "/home/test/CU2"
+str_TU:
+ .asciz "Type_Unit"
+str_TU_type:
+ .asciz "MyStruct"
+
+ .section .debug_str.dwo,"MS",@progbits,1
+dwo_str_CU_5_producer:
+ .asciz "Handmade split DWARF producer"
+dwo_str_CU_5_name:
+ .asciz "V5_split_compile_unit"
+dwo_str_CU_5_comp_dir:
+ .asciz "/home/test/splitCU"
+dwo_str_TU_5:
+ .asciz "V5_split_type_unit"
+dwo_str_TU_5_type:
+ .asciz "V5_split_Mystruct"
+
+# A rudimentary abbrev section.
+ .section .debug_abbrev,"",@progbits
+ .byte 0x01 # Abbrev code
+ .byte 0x11 # DW_TAG_compile_unit
+ .byte 0x00 # DW_CHILDREN_no
+ .byte 0x72 # DW_AT_str_offsets_base
+ .byte 0x17 # DW_FORM_sec_offset
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x00 # EOM(3)
+
+ .section .debug_info,"",@progbits
+# DWARF v5 CU header.
+ .long CU1_5_end-CU1_5_version # Length of Unit
+CU1_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+# A compile-unit DIE, which has a DW_AT_str_offsets_base attribute.
+ .byte 1 # Abbreviation code
+ .long .debug_str_offsets_base0
+CU1_5_end:
+
+# DWARF v5 CU header.
+ .long CU2_5_end-CU2_5_version # Length of Unit
+CU2_5_version:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+# A compile-unit DIE, which has a DW_AT_str_offsets_base attribute.
+ .byte 1 # Abbreviation code
+ .long .debug_str_offsets_base1
+CU2_5_end:
+
+ .section .debug_str_offsets,"",@progbits
+# CU1's contribution
+ .long .debug_str_offsets_segment1_end-.debug_str_offsets_base0
+ .short 5 # DWARF version
+ .short 0 # Padding
+.debug_str_offsets_base0:
+ .long str_producer
+ .long str_CU1
+ .long str_CU1_dir
+.debug_str_offsets_segment0_end:
+# CU2's contribution
+# Overlapping with CU1's contribution
+ .long .debug_str_offsets_segment1_end-.debug_str_offsets_base1
+ .short 5 # DWARF version
+ .short 0 # Padding
+.debug_str_offsets_base1:
+ .long str_producer
+ .long str_CU2
+ .long str_CU2_dir
+.debug_str_offsets_segment1_end:
+
+# OVERLAP: .debug_str_offsets contents:
+# OVERLAP-NOT: contents:
+# OVERLAP: error: overlapping contributions to string offsets table in section .debug_str_offsets.
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s b/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s
index 34f151d37d95..f8f48ea13882 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s
@@ -43,14 +43,17 @@ Ldebug_str_offsets_base0:
.long str_Variable2
.long str_Variable3
Ldebug_str_offsets_segment0_end:
-# CU2's contribution
- .long Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1
+# A 4-byte gap.
+ .long 0
+# CU2's contribution (DWARF64 format)
+ .long 0xffffffff
+ .quad Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1
.short 5 # DWARF version
.short 0 # Padding
Ldebug_str_offsets_base1:
- .long str_producer
- .long str_CU2
- .long str_CU2_dir
+ .quad str_producer
+ .quad str_CU2
+ .quad str_CU2_dir
Ldebug_str_offsets_segment1_end:
# The TU's contribution
.long Ldebug_str_offsets_segment2_end-Ldebug_str_offsets_base2
@@ -234,20 +237,20 @@ TU_5_end:
# COMMON: DW_TAG_compile_unit
# COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c)
+# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000038)
# COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
#
# The type unit
# COMMON: .debug_types contents:
# COMMON: DW_TAG_type_unit
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040)
+# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000058)
# COMMON: DW_TAG_structure_type
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct")
#
# The .debug_str_offsets section
# COMMON: .debug_str_offsets contents:
-# COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5
+# COMMON-NEXT: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5
# COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer"
# COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1"
# COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1"
@@ -255,10 +258,11 @@ TU_5_end:
# COMMON-NEXT: 0x00000018: 0000006e "MyVar1"
# COMMON-NEXT: 0x0000001c: 00000075 "MyVar2"
# COMMON-NEXT: 0x00000020: 0000007c "MyVar3"
-# COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5
-# COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer"
-# COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2"
-# COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2"
-# COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5
-# COMMON-NEXT: 0x00000040: 00000054 "Type_Unit"
-# COMMON-NEXT: 0x00000044: 0000005e "MyStruct"
+# COMMON-NEXT: 0x00000024: Gap, length = 4
+# COMMON-NEXT: 0x00000028: Contribution size = 24, Format = DWARF64, Version = 5
+# COMMON-NEXT: 0x00000038: 00000000 "Handmade DWARF producer"
+# COMMON-NEXT: 0x00000040: 00000036 "Compile_Unit_2"
+# COMMON-NEXT: 0x00000048: 00000045 "/home/test/CU2"
+# COMMON-NEXT: 0x00000050: Contribution size = 8, Format = DWARF32, Version = 5
+# COMMON-NEXT: 0x00000058: 00000054 "Type_Unit"
+# COMMON-NEXT: 0x0000005c: 0000005e "MyStruct"
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets.s b/test/DebugInfo/X86/dwarfdump-str-offsets.s
index 5fbef186af60..363775a59f50 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets.s
@@ -44,14 +44,17 @@ str_Variable3:
.long str_Variable2
.long str_Variable3
.debug_str_offsets_segment0_end:
-# CU2's contribution
- .long .debug_str_offsets_segment1_end-.debug_str_offsets_base1
+# A 4-byte gap.
+ .long 0
+# CU2's contribution in DWARF64 format
+ .long 0xffffffff
+ .quad .debug_str_offsets_segment1_end-.debug_str_offsets_base1
.short 5 # DWARF version
.short 0 # Padding
.debug_str_offsets_base1:
- .long str_producer
- .long str_CU2
- .long str_CU2_dir
+ .quad str_producer
+ .quad str_CU2
+ .quad str_CU2_dir
.debug_str_offsets_segment1_end:
# The TU's contribution
.long .debug_str_offsets_segment2_end-.debug_str_offsets_base2
@@ -75,7 +78,7 @@ dwo_str_TU_5_type:
.asciz "V5_split_Mystruct"
.section .debug_str_offsets.dwo,"",@progbits
-# The split CU's contribution
+# One contribution only in a .dwo file
.long .debug_dwo_str_offsets_segment0_end-.debug_dwo_str_offsets_base0
.short 5 # DWARF version
.short 0 # Padding
@@ -83,15 +86,9 @@ dwo_str_TU_5_type:
.long dwo_str_CU_5_producer-.debug_str.dwo
.long dwo_str_CU_5_name-.debug_str.dwo
.long dwo_str_CU_5_comp_dir-.debug_str.dwo
-.debug_dwo_str_offsets_segment0_end:
-# The split TU's contribution
- .long .debug_dwo_str_offsets_segment1_end-.debug_dwo_str_offsets_base1
- .short 5 # DWARF version
- .short 0 # Padding
-.debug_dwo_str_offsets_base1:
.long dwo_str_TU_5-.debug_str.dwo
.long dwo_str_TU_5_type-.debug_str.dwo
-.debug_dwo_str_offsets_segment1_end:
+.debug_dwo_str_offsets_segment0_end:
# All CUs/TUs use the same abbrev section for simplicity.
.section .debug_abbrev,"",@progbits
@@ -163,8 +160,6 @@ dwo_str_TU_5_type:
.byte 0x1a # DW_FORM_strx
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x1b # DW_AT_comp_dir
.byte 0x1a # DW_FORM_strx
.byte 0x00 # EOM(1)
@@ -174,8 +169,6 @@ dwo_str_TU_5_type:
.byte 0x01 # DW_CHILDREN_yes
.byte 0x03 # DW_AT_name
.byte 0x1a # DW_FORM_strx
- .byte 0x72 # DW_AT_str_offsets_base
- .byte 0x17 # DW_FORM_sec_offset
.byte 0x00 # EOM(1)
.byte 0x00 # EOM(2)
.byte 0x03 # Abbrev code
@@ -275,7 +268,6 @@ CU_split_5_version:
.byte 1 # Abbreviation code
.byte 0 # The index of the producer string
.byte 1 # The index of the CU name string
- .long .debug_dwo_str_offsets_base0-.debug_str_offsets.dwo
.byte 2 # The index of the comp dir string
.byte 0 # NULL
CU_split_5_end:
@@ -294,12 +286,11 @@ TU_split_5_version:
.long TU_split_5_type-TU_split_5_start # Type offset
# The type-unit DIE, which has a name.
.byte 2 # Abbreviation code
- .byte 0 # The index of the type unit name string
- .long .debug_dwo_str_offsets_base1-.debug_str_offsets.dwo
+ .byte 3 # The index of the type unit name string
# The type DIE, which has a name.
TU_split_5_type:
.byte 3 # Abbreviation code
- .byte 1 # The index of the type name string
+ .byte 4 # The index of the type name string
.byte 0 # NULL
.byte 0 # NULL
TU_split_5_end:
@@ -340,7 +331,7 @@ TU_split_5_end:
# COMMON: DW_TAG_compile_unit
# COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c)
+# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000038)
# COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2")
#
# The split CU
@@ -349,28 +340,25 @@ TU_split_5_end:
# SPLIT: DW_TAG_compile_unit
# SPLIT-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer")
# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit")
-# SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
# SPLIT-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU")
#
# The type unit
# COMMON: .debug_types contents:
# COMMON: DW_TAG_type_unit
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit")
-# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040)
# COMMON: DW_TAG_structure_type
# COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct")
#
# The split type unit
# SPLIT: .debug_types.dwo contents:
# SPLIT: DW_TAG_type_unit
-# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit")
-# SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c)
+# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "V5_split_type_unit")
# SPLIT: DW_TAG_structure_type
-# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct")
+# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "V5_split_Mystruct")
#
# The .debug_str_offsets section
# COMMON: .debug_str_offsets contents:
-# COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5
+# COMMON-NEXT: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5
# COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer"
# COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1"
# COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1"
@@ -378,19 +366,19 @@ TU_split_5_end:
# COMMON-NEXT: 0x00000018: 0000006e "MyVar1"
# COMMON-NEXT: 0x0000001c: 00000075 "MyVar2"
# COMMON-NEXT: 0x00000020: 0000007c "MyVar3"
-# COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5
-# COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer"
-# COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2"
-# COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2"
-# COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5
-# COMMON-NEXT: 0x00000040: 00000054 "Type_Unit"
-# COMMON-NEXT: 0x00000044: 0000005e "MyStruct"
+# COMMON-NEXT: Gap, length = 4
+# COMMON-NEXT: 0x00000028: Contribution size = 24, Format = DWARF64, Version = 5
+# COMMON-NEXT: 0x00000038: 00000000 "Handmade DWARF producer"
+# COMMON-NEXT: 0x00000040: 00000036 "Compile_Unit_2"
+# COMMON-NEXT: 0x00000048: 00000045 "/home/test/CU2"
+# COMMON-NEXT: 0x00000050: Contribution size = 8, Format = DWARF32, Version = 5
+# COMMON-NEXT: 0x00000058: 00000054 "Type_Unit"
+# COMMON-NEXT: 0x0000005c: 0000005e "MyStruct"
#
# SPLIT: .debug_str_offsets.dwo contents:
-# SPLIT-NEXT: 0x00000000: Contribution size = 12, Version = 5
+# SPLIT-NEXT: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 5
# SPLIT-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer"
# SPLIT-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit"
# SPLIT-NEXT: 0x00000010: 00000034 "/home/test/splitCU"
-# SPLIT-NEXT: 0x00000014: Contribution size = 8, Version = 5
-# SPLIT-NEXT: 0x0000001c: 00000047 "V5_split_type_unit"
-# SPLIT-NEXT: 0x00000020: 0000005a "V5_split_Mystruct"
+# SPLIT-NEXT: 0x00000014: 00000047 "V5_split_type_unit"
+# SPLIT-NEXT: 0x00000018: 0000005a "V5_split_Mystruct"
diff --git a/test/Instrumentation/HWAddressSanitizer/basic.ll b/test/Instrumentation/HWAddressSanitizer/basic.ll
index db1f52fc56e9..a5e49062669e 100644
--- a/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ b/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -1,6 +1,7 @@
; Test basic address sanitizer instrumentation.
;
-; RUN: opt < %s -hwasan -S | FileCheck %s
+; RUN: opt < %s -hwasan -hwasan-recover=0 -S | FileCheck %s --check-prefixes=CHECK,ABORT
+; RUN: opt < %s -hwasan -hwasan-recover=1 -S | FileCheck %s --check-prefixes=CHECK,RECOVER
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android"
@@ -17,8 +18,10 @@ define i8 @test_load8(i8* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #256", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #256", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #288", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i8, i8* %a, align 4
; CHECK: ret i8 %[[G]]
@@ -40,8 +43,10 @@ define i16 @test_load16(i16* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #257", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #257", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #289", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i16, i16* %a, align 4
; CHECK: ret i16 %[[G]]
@@ -63,8 +68,10 @@ define i32 @test_load32(i32* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #258", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #258", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #290", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i32, i32* %a, align 4
; CHECK: ret i32 %[[G]]
@@ -86,8 +93,10 @@ define i64 @test_load64(i64* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #259", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #259", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #291", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i64, i64* %a, align 8
; CHECK: ret i64 %[[G]]
@@ -109,8 +118,10 @@ define i128 @test_load128(i128* %a) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #260", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #260", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #292", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: %[[G:[^ ]*]] = load i128, i128* %a, align 16
; CHECK: ret i128 %[[G]]
@@ -123,7 +134,8 @@ entry:
define i40 @test_load40(i40* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_load(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_load(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_load_noabort(i64 %[[A]], i64 5)
; CHECK: %[[B:[^ ]*]] = load i40, i40* %a
; CHECK: ret i40 %[[B]]
@@ -144,8 +156,10 @@ define void @test_store8(i8* %a, i8 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #272", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #272", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #304", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i8 %b, i8* %a, align 4
; CHECK: ret void
@@ -167,8 +181,10 @@ define void @test_store16(i16* %a, i16 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #273", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #273", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #305", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i16 %b, i16* %a, align 4
; CHECK: ret void
@@ -190,8 +206,10 @@ define void @test_store32(i32* %a, i32 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #274", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #274", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #306", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i32 %b, i32* %a, align 4
; CHECK: ret void
@@ -213,8 +231,10 @@ define void @test_store64(i64* %a, i64 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #275", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #275", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #307", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i64 %b, i64* %a, align 8
; CHECK: ret void
@@ -236,8 +256,10 @@ define void @test_store128(i128* %a, i128 %b) sanitize_hwaddress {
; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
-; CHECK: call void asm sideeffect "hlt #276", "{x0}"(i64 %[[A]])
-; CHECK: br label
+; ABORT: call void asm sideeffect "hlt #276", "{x0}"(i64 %[[A]])
+; ABORT: unreachable
+; RECOVER: call void asm sideeffect "hlt #308", "{x0}"(i64 %[[A]])
+; RECOVER: br label
; CHECK: store i128 %b, i128* %a, align 16
; CHECK: ret void
@@ -250,7 +272,8 @@ entry:
define void @test_store40(i40* %a, i40 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_store(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_store(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 5)
; CHECK: store i40 %b, i40* %a
; CHECK: ret void
@@ -262,7 +285,8 @@ entry:
define void @test_store_unaligned(i64* %a, i64 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store_unaligned(
; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64
-; CHECK: call void @__hwasan_store(i64 %[[A]], i64 8)
+; ABORT: call void @__hwasan_store(i64 %[[A]], i64 8)
+; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 8)
; CHECK: store i64 %b, i64* %a, align 4
; CHECK: ret void
diff --git a/test/Instrumentation/HWAddressSanitizer/with-calls.ll b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
index be3b33d4f8d6..0a1713b83912 100644
--- a/test/Instrumentation/HWAddressSanitizer/with-calls.ll
+++ b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
@@ -1,6 +1,7 @@
; Test basic address sanitizer instrumentation.
;
-; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -S | FileCheck %s
+; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -S | FileCheck %s --check-prefixes=CHECK,ABORT
+; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -hwasan-recover=1 -S | FileCheck %s --check-prefixes=CHECK,RECOVER
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android"
@@ -8,7 +9,8 @@ target triple = "aarch64--linux-android"
define i8 @test_load8(i8* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load8(
; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %a to i64
-; CHECK: call void @__hwasan_load1(i64 %[[A]])
+; ABORT: call void @__hwasan_load1(i64 %[[A]])
+; RECOVER: call void @__hwasan_load1_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i8, i8* %a
; CHECK: ret i8 %[[B]]
@@ -20,7 +22,8 @@ entry:
define i16 @test_load16(i16* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load16(
; CHECK: %[[A:[^ ]*]] = ptrtoint i16* %a to i64
-; CHECK: call void @__hwasan_load2(i64 %[[A]])
+; ABORT: call void @__hwasan_load2(i64 %[[A]])
+; RECOVER: call void @__hwasan_load2_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i16, i16* %a
; CHECK: ret i16 %[[B]]
@@ -32,7 +35,8 @@ entry:
define i32 @test_load32(i32* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load32(
; CHECK: %[[A:[^ ]*]] = ptrtoint i32* %a to i64
-; CHECK: call void @__hwasan_load4(i64 %[[A]])
+; ABORT: call void @__hwasan_load4(i64 %[[A]])
+; RECOVER: call void @__hwasan_load4_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i32, i32* %a
; CHECK: ret i32 %[[B]]
@@ -44,7 +48,8 @@ entry:
define i64 @test_load64(i64* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load64(
; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64
-; CHECK: call void @__hwasan_load8(i64 %[[A]])
+; ABORT: call void @__hwasan_load8(i64 %[[A]])
+; RECOVER: call void @__hwasan_load8_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i64, i64* %a
; CHECK: ret i64 %[[B]]
@@ -56,7 +61,8 @@ entry:
define i128 @test_load128(i128* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load128(
; CHECK: %[[A:[^ ]*]] = ptrtoint i128* %a to i64
-; CHECK: call void @__hwasan_load16(i64 %[[A]])
+; ABORT: call void @__hwasan_load16(i64 %[[A]])
+; RECOVER: call void @__hwasan_load16_noabort(i64 %[[A]])
; CHECK: %[[B:[^ ]*]] = load i128, i128* %a
; CHECK: ret i128 %[[B]]
@@ -68,7 +74,8 @@ entry:
define i40 @test_load40(i40* %a) sanitize_hwaddress {
; CHECK-LABEL: @test_load40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_load(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_load(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_load_noabort(i64 %[[A]], i64 5)
; CHECK: %[[B:[^ ]*]] = load i40, i40* %a
; CHECK: ret i40 %[[B]]
@@ -80,7 +87,8 @@ entry:
define void @test_store8(i8* %a, i8 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store8(
; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %a to i64
-; CHECK: call void @__hwasan_store1(i64 %[[A]])
+; ABORT: call void @__hwasan_store1(i64 %[[A]])
+; RECOVER: call void @__hwasan_store1_noabort(i64 %[[A]])
; CHECK: store i8 %b, i8* %a
; CHECK: ret void
@@ -92,7 +100,8 @@ entry:
define void @test_store16(i16* %a, i16 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store16(
; CHECK: %[[A:[^ ]*]] = ptrtoint i16* %a to i64
-; CHECK: call void @__hwasan_store2(i64 %[[A]])
+; ABORT: call void @__hwasan_store2(i64 %[[A]])
+; RECOVER: call void @__hwasan_store2_noabort(i64 %[[A]])
; CHECK: store i16 %b, i16* %a
; CHECK: ret void
@@ -104,7 +113,8 @@ entry:
define void @test_store32(i32* %a, i32 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store32(
; CHECK: %[[A:[^ ]*]] = ptrtoint i32* %a to i64
-; CHECK: call void @__hwasan_store4(i64 %[[A]])
+; ABORT: call void @__hwasan_store4(i64 %[[A]])
+; RECOVER: call void @__hwasan_store4_noabort(i64 %[[A]])
; CHECK: store i32 %b, i32* %a
; CHECK: ret void
@@ -116,7 +126,8 @@ entry:
define void @test_store64(i64* %a, i64 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store64(
; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64
-; CHECK: call void @__hwasan_store8(i64 %[[A]])
+; ABORT: call void @__hwasan_store8(i64 %[[A]])
+; RECOVER: call void @__hwasan_store8_noabort(i64 %[[A]])
; CHECK: store i64 %b, i64* %a
; CHECK: ret void
@@ -128,7 +139,8 @@ entry:
define void @test_store128(i128* %a, i128 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store128(
; CHECK: %[[A:[^ ]*]] = ptrtoint i128* %a to i64
-; CHECK: call void @__hwasan_store16(i64 %[[A]])
+; ABORT: call void @__hwasan_store16(i64 %[[A]])
+; RECOVER: call void @__hwasan_store16_noabort(i64 %[[A]])
; CHECK: store i128 %b, i128* %a
; CHECK: ret void
@@ -140,7 +152,8 @@ entry:
define void @test_store40(i40* %a, i40 %b) sanitize_hwaddress {
; CHECK-LABEL: @test_store40(
; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64
-; CHECK: call void @__hwasan_store(i64 %[[A]], i64 5)
+; ABORT: call void @__hwasan_store(i64 %[[A]], i64 5)
+; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 5)
; CHECK: store i40 %b, i40* %a
; CHECK: ret void
diff --git a/test/MC/AArch64/arm64-system-encoding.s b/test/MC/AArch64/arm64-system-encoding.s
index ef4037b7bf3f..19ed248db3a8 100644
--- a/test/MC/AArch64/arm64-system-encoding.s
+++ b/test/MC/AArch64/arm64-system-encoding.s
@@ -1,4 +1,5 @@
; RUN: not llvm-mc -triple arm64-apple-darwin -show-encoding < %s 2> %t | FileCheck %s
+; RUN: not llvm-mc -triple arm64-apple-darwin -mattr=+v8.3a -show-encoding < %s 2> %t | FileCheck %s --check-prefix=CHECK-V83
; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
foo:
@@ -233,6 +234,7 @@ foo:
mrs x3, AMAIR_EL3
mrs x3, CCSIDR_EL1
mrs x3, CLIDR_EL1
+ mrs x3, CCSIDR2_EL1
mrs x3, CNTFRQ_EL0
mrs x3, CNTHCTL_EL2
mrs x3, CNTHP_CTL_EL2
@@ -418,6 +420,7 @@ foo:
; CHECK: mrs x3, AMAIR_EL3 ; encoding: [0x03,0xa3,0x3e,0xd5]
; CHECK: mrs x3, CCSIDR_EL1 ; encoding: [0x03,0x00,0x39,0xd5]
; CHECK: mrs x3, CLIDR_EL1 ; encoding: [0x23,0x00,0x39,0xd5]
+; CHECK-V83: mrs x3, CCSIDR2_EL1 ; encoding: [0x43,0x00,0x39,0xd5]
; CHECK: mrs x3, CNTFRQ_EL0 ; encoding: [0x03,0xe0,0x3b,0xd5]
; CHECK: mrs x3, CNTHCTL_EL2 ; encoding: [0x03,0xe1,0x3c,0xd5]
; CHECK: mrs x3, CNTHP_CTL_EL2 ; encoding: [0x23,0xe2,0x3c,0xd5]
diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s
index 597adfb3a25b..3b791bef0b6f 100644
--- a/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/test/MC/AArch64/basic-a64-diagnostics.s
@@ -3504,6 +3504,7 @@
msr MIDR_EL1, x12
msr CCSIDR_EL1, x12
msr CLIDR_EL1, x12
+ msr CCSIDR2_EL1, x12
msr CTR_EL0, x12
msr MPIDR_EL1, x12
msr REVIDR_EL1, x12
@@ -3572,6 +3573,9 @@
// CHECK-ERROR-NEXT: msr CLIDR_EL1, x12
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: expected writable system register or pstate
+// CHECK-ERROR-NEXT: msr CCSIDR2_EL1, x12
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected writable system register or pstate
// CHECK-ERROR-NEXT: msr CTR_EL0, x12
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: expected writable system register or pstate
diff --git a/test/MC/AArch64/dot-req.s b/test/MC/AArch64/dot-req.s
index a557f0c67589..582674b1f8d9 100644
--- a/test/MC/AArch64/dot-req.s
+++ b/test/MC/AArch64/dot-req.s
@@ -37,3 +37,13 @@ bar:
// CHECK: fmov d2, d3 // encoding: [0x62,0x40,0x60,0x1e]
// CHECK: ldr q2, [sp] // encoding: [0xe2,0x03,0xc0,0x3d]
// CHECK: mov v0.8b, v1.8b // encoding: [0x20,0x1c,0xa1,0x0e]
+
+ peter .req x6
+ add peter, x0, x0
+ .unreq peter
+// CHECK: add x6, x0, x0
+
+ zoe .req x6
+ add zoe, x0, x0
+ .unreq zoe
+// CHECK: add x6, x0, x0
diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s
index ef36a98f746a..b06101a4051b 100644
--- a/test/MC/AMDGPU/ds.s
+++ b/test/MC/AMDGPU/ds.s
@@ -511,6 +511,10 @@ ds_swizzle_b32 v8, v2
// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08]
// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
+ds_swizzle_b32 v8, v2 gds
+// SICI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0xd6,0xd8,0x02,0x00,0x00,0x08]
+// VI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x02,0x00,0x00,0x08]
+
ds_swizzle_b32 v8, v2 offset:0xFFFF
// SICI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x02,0x00,0x00,0x08]
// VI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x02,0x00,0x00,0x08]
diff --git a/test/MC/AMDGPU/expressions.s b/test/MC/AMDGPU/expressions.s
index 7b0e90378a06..dd4957c8baec 100644
--- a/test/MC/AMDGPU/expressions.s
+++ b/test/MC/AMDGPU/expressions.s
@@ -46,3 +46,11 @@ BB2:
s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1
// VI: s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1 ; encoding: [0x6a,0xff,0xea,0x80,A,A,A,A]
// VI-NEXT: ; fixup A - offset: 4, value: (BB2+4)-BB1, kind: FK_Data_4
+
+t=1
+s_sub_u32 s0, s0, -t
+// VI: s_sub_u32 s0, s0, -1 ; encoding: [0x00,0xc1,0x80,0x80]
+
+t=-1
+s_sub_u32 s0, s0, -t
+// VI: s_sub_u32 s0, s0, 1 ; encoding: [0x00,0x81,0x80,0x80]
diff --git a/test/MC/AMDGPU/invalid-instructions-spellcheck.s b/test/MC/AMDGPU/invalid-instructions-spellcheck.s
new file mode 100644
index 000000000000..f4198f10f4b8
--- /dev/null
+++ b/test/MC/AMDGPU/invalid-instructions-spellcheck.s
@@ -0,0 +1,48 @@
+# RUN: not llvm-mc -triple amdgcn < %s 2>&1 | FileCheck %s
+
+# This tests the mnemonic spell checker.
+
+# First check what happens when an instruction is omitted:
+
+v2, v4, v6
+
+# CHECK: unknown token in expression
+# CHECK-NEXT: v2, v4, v6
+# CHECK-NEXT: ^
+
+# CHECK: error: not a valid operand.
+# CHECK-NEXT: v2, v4, v6
+# CHECK-NEXT: ^
+
+# We don't want to see a suggestion here; the edit distance is too large to
+# give sensible suggestions:
+
+aaaaaaaaaaaaaaa v1, v2, v3
+
+# CHECK: error: invalid instruction
+# CHECK-NEXT: aaaaaaaaaaaaaaa v1, v2, v3
+# CHECK-NEXT: ^
+
+# Check that we get one suggestion: 'dsc_write_src2_b64' is 1 edit away, i.e. a deletion.
+
+dsc_write_src2_b64 v1, v2, v3
+
+# CHECK: error: invalid instruction, did you mean: ds_write_src2_b64?
+# CHECK-NEXT: dsc_write_src2_b64 v1, v2, v3
+# CHECK-NEXT: ^
+
+# Check edit distance 1 and 2, just insertions:
+
+s_mov_b v1, v2
+
+# CHECK: error: invalid instruction, did you mean: s_mov_b32, s_mov_b64?
+# CHECK-NEXT: s_mov_b v1, v2
+# CHECK-NEXT: ^
+
+# Check an instruction that is 2 edits away, and also has a lot of candidates:
+
+s_load_dwordx v1, v2, v3
+
+# CHECK: error: invalid instruction, did you mean: s_load_dword, s_load_dwordx16, s_load_dwordx2, s_load_dwordx4, s_load_dwordx8?
+# CHECK-NEXT: s_load_dwordx v1, v2, v3
+# CHECK-NEXT: ^
diff --git a/test/MC/AMDGPU/trap.s b/test/MC/AMDGPU/trap.s
index de551ca00b52..7b527ba3072e 100644
--- a/test/MC/AMDGPU/trap.s
+++ b/test/MC/AMDGPU/trap.s
@@ -190,6 +190,48 @@ s_mov_b64 ttmp[14:15], exec
// GFX9: s_mov_b64 ttmp[14:15], exec ; encoding: [0x7e,0x01,0xfa,0xbe]
//===----------------------------------------------------------------------===//
+// Trap Handler related - 8-dword registers
+// NB: gfx7 doc states that SMRD does not support trap registers for dst
+//===----------------------------------------------------------------------===//
+
+s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0
+// VI: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00]
+
+s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0
+// VI: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+
+s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 ttmp[0:7], s[0:1], s0
+// VI: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 ttmp[4:11], s[0:1], s0
+// VI: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+// GFX9: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx8 ttmp[8:15], s[0:1], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+
+//===----------------------------------------------------------------------===//
+// Trap Handler related - 16-dword registers
+// NB: gfx7 doc states that SMRD does not support trap registers for dst
+//===----------------------------------------------------------------------===//
+
+s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00]
+
+s_load_dwordx16 ttmp[0:15], s[0:1], s0
+// NOSICIVI: error: not a valid operand
+// GFX9: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00]
+
+//===----------------------------------------------------------------------===//
// Trap Handler related - Some specific instructions
//===----------------------------------------------------------------------===//
diff --git a/test/MC/AMDGPU/vop1-gfx9-err.s b/test/MC/AMDGPU/vop1-gfx9-err.s
index 0036b79c71b5..61bf5f661759 100644
--- a/test/MC/AMDGPU/vop1-gfx9-err.s
+++ b/test/MC/AMDGPU/vop1-gfx9-err.s
@@ -1,6 +1,6 @@
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,VI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,CI %s
v_swap_b32 v1, 1
// GCN: :16: error: invalid operand for instruction
@@ -10,7 +10,9 @@ v_swap_b32 v1, s0
// FIXME: Better error for it requiring VOP1 encoding
v_swap_b32_e64 v1, v2
-// GCN: :1: error: unrecognized instruction mnemonic
+// GFX9: :1: error: invalid instruction, did you mean: v_swap_b32?
+// CI: :1: error: invalid instruction
+// VI: :1: error: invalid instruction
v_swap_b32 v1, v2, v1
// GCN: :20: error: invalid operand for instruction
@@ -22,4 +24,4 @@ v_swap_b32 v1, v2, v2, v2
// GCN: :20: error: invalid operand for instruction
v_swap_codegen_pseudo_b32 v1, v2
-// GCN: :1: error: unrecognized instruction mnemonic
+// GCN: :1: error: invalid instruction
diff --git a/test/MC/AMDGPU/vop3p-err.s b/test/MC/AMDGPU/vop3p-err.s
index 7dfd951b6ad0..fe3fee97b5e7 100644
--- a/test/MC/AMDGPU/vop3p-err.s
+++ b/test/MC/AMDGPU/vop3p-err.s
@@ -59,16 +59,16 @@ v_pk_add_f16 v1, v2, |v3|
// GFX9: :22: error: invalid operand for instruction
v_pk_add_f16 v1, v2, abs(v3)
-// GFX9: :19: error: invalid operand for instruction
+// GFX9: :18: error: invalid operand for instruction
v_pk_add_f16 v1, -v2, v3
-// GFX9: :23: error: invalid operand for instruction
+// GFX9: :22: error: invalid operand for instruction
v_pk_add_f16 v1, v2, -v3
// GFX9: :18: error: invalid operand for instruction
v_pk_add_u16 v1, abs(v2), v3
-// GFX9: :19: error: invalid operand for instruction
+// GFX9: :18: error: invalid operand for instruction
v_pk_add_u16 v1, -v2, v3
//
diff --git a/test/MC/ARM/dfb-neg.s b/test/MC/ARM/dfb-neg.s
new file mode 100644
index 000000000000..15c44877fa6f
--- /dev/null
+++ b/test/MC/ARM/dfb-neg.s
@@ -0,0 +1,10 @@
+@ RUN: not llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple thumbv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s 2>&1 | FileCheck %s
+
+ dfb
+@ CHECK: error: instruction requires: full-data-barrier
+
+ dfb sy
+ dfb #0
+@ CHECK: error: invalid instruction
+@ CHECK: error: invalid instruction
diff --git a/test/MC/ARM/dfb.s b/test/MC/ARM/dfb.s
new file mode 100644
index 000000000000..58477749807f
--- /dev/null
+++ b/test/MC/ARM/dfb.s
@@ -0,0 +1,6 @@
+@ RUN: llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-ARM
+@ RUN: llvm-mc -triple thumbv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-THUMB
+
+ dfb
+@ CHECK-ARM: dfb @ encoding: [0x4c,0xf0,0x7f,0xf5]
+@ CHECK-THUMB: dfb @ encoding: [0xbf,0xf3,0x4c,0x8f]
diff --git a/test/MC/COFF/align-nops.s b/test/MC/COFF/align-nops.s
index 02b488475e90..6d23721ed779 100644
--- a/test/MC/COFF/align-nops.s
+++ b/test/MC/COFF/align-nops.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | llvm-readobj -s -sd | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 -mcpu=pentiumpro %s | llvm-readobj -s -sd | FileCheck %s
// Test that we get optimal nops in text
.text
diff --git a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
index a2f9d24091ef..33b8d6b4de44 100644
--- a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
+++ b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
@@ -1,6 +1,7 @@
# RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8,+fullfp16 -disassemble < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FP16
+# RUN: llvm-mc -triple=arm64 -mattr=+v8.3a -disassemble < %s | FileCheck %s --check-prefix=CHECK-V83
#------------------------------------------------------------------------------
# Add/sub (immediate)
@@ -3493,6 +3494,7 @@
# CHECK: mrs x9, {{midr_el1|MIDR_EL1}}
# CHECK: mrs x9, {{ccsidr_el1|CCSIDR_EL1}}
# CHECK: mrs x9, {{csselr_el1|CSSELR_EL1}}
+# CHECK-V83: mrs x9, {{ccsidr2_el1|CCSIDR2_EL1}}
# CHECK: mrs x9, {{vpidr_el2|VPIDR_EL2}}
# CHECK: mrs x9, {{clidr_el1|CLIDR_EL1}}
# CHECK: mrs x9, {{ctr_el0|CTR_EL0}}
@@ -4048,6 +4050,7 @@
0x9 0x0 0x38 0xd5
0x9 0x0 0x39 0xd5
0x9 0x0 0x3a 0xd5
+0x49 0x0 0x39 0xd5
0x9 0x0 0x3c 0xd5
0x29 0x0 0x39 0xd5
0x29 0x0 0x3b 0xd5
diff --git a/test/MC/Disassembler/AMDGPU/ds_vi.txt b/test/MC/Disassembler/AMDGPU/ds_vi.txt
index 6d910ea5bb58..c12e7a157e82 100644
--- a/test/MC/Disassembler/AMDGPU/ds_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/ds_vi.txt
@@ -171,6 +171,9 @@
# VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08]
0x00 0x00 0x7a 0xd8 0x02 0x00 0x00 0x08
+# VI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x02,0x00,0x00,0x08]
+0x00 0x00 0x7b 0xd8 0x02 0x00 0x00 0x08
+
# VI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0x6c,0xd8,0x02,0x00,0x00,0x08]
0x00 0x00 0x6c 0xd8 0x02 0x00 0x00 0x08
diff --git a/test/MC/Disassembler/AMDGPU/trap_gfx9.txt b/test/MC/Disassembler/AMDGPU/trap_gfx9.txt
index 70a570eec5ab..0b140c42168a 100644
--- a/test/MC/Disassembler/AMDGPU/trap_gfx9.txt
+++ b/test/MC/Disassembler/AMDGPU/trap_gfx9.txt
@@ -107,3 +107,35 @@
# GFX9: buffer_atomic_inc v1, off, ttmp[12:15], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8]
0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8
+
+#===----------------------------------------------------------------------===#
+# Trap Handler related - 8-dword registers
+#===----------------------------------------------------------------------===#
+
+# GFX9: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0 ; encoding: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx8 ttmp[8:15], s[0:1], s0 ; encoding: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+#===----------------------------------------------------------------------===#
+# Trap Handler related - 16-dword registers
+#===----------------------------------------------------------------------===#
+
+# GFX9: s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0 ; encoding: [0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00
+
+# GFX9: s_load_dwordx16 ttmp[0:15], s[0:1], s0 ; encoding: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00
diff --git a/test/MC/Disassembler/AMDGPU/trap_vi.txt b/test/MC/Disassembler/AMDGPU/trap_vi.txt
index 8b131512050c..eb254134cc53 100644
--- a/test/MC/Disassembler/AMDGPU/trap_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/trap_vi.txt
@@ -107,3 +107,19 @@
# VI: buffer_atomic_inc v1, off, ttmp[8:11], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8]
0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8
+
+#===----------------------------------------------------------------------===#
+# Trap Handler related - 8-dword registers
+#===----------------------------------------------------------------------===#
+
+# VI: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# VI: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00
+
+# VI: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00
+
+# VI: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00]
+0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00
diff --git a/test/MC/Disassembler/ARM/dfb-arm.txt b/test/MC/Disassembler/ARM/dfb-arm.txt
new file mode 100644
index 000000000000..26f81621274c
--- /dev/null
+++ b/test/MC/Disassembler/ARM/dfb-arm.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -disassemble -triple armv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DFB
+# RUN: llvm-mc -disassemble -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s | FileCheck %s --check-prefix=CHECK-NODFB
+
+# CHECK-DFB: dfb @ encoding: [0x4c,0xf0,0x7f,0xf5]
+# CHECK-NODFB: dsb #0xc @ encoding: [0x4c,0xf0,0x7f,0xf5]
+[0x4c,0xf0,0x7f,0xf5]
diff --git a/test/MC/Disassembler/ARM/dfb-thumb.txt b/test/MC/Disassembler/ARM/dfb-thumb.txt
new file mode 100644
index 000000000000..aa8adc83c1f4
--- /dev/null
+++ b/test/MC/Disassembler/ARM/dfb-thumb.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -disassemble -triple thumbv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DFB
+# RUN: llvm-mc -disassemble -triple thumbv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s | FileCheck %s --check-prefix=CHECK-NODFB
+
+# CHECK-DFB: dfb @ encoding: [0xbf,0xf3,0x4c,0x8f]
+# CHECK-NODFB: dsb #0xc @ encoding: [0xbf,0xf3,0x4c,0x8f]
+[0xbf,0xf3,0x4c,0x8f]
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 4211721ec48b..cc05dfb6f896 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -667,6 +667,9 @@
# CHECK: prefetchw (%eax)
0x0f 0x0d 0x08
+# CHECK: prefetchwt1 (%eax)
+0x0f 0x0d 0x10
+
# CHECK: adcxl %eax, %eax
0x66 0x0f 0x38 0xf6 0xc0
diff --git a/test/MC/ELF/align-nops.s b/test/MC/ELF/align-nops.s
index 5e3386823f26..32da3dbd8e82 100644
--- a/test/MC/ELF/align-nops.s
+++ b/test/MC/ELF/align-nops.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -sd | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - | llvm-readobj -s -sd | FileCheck %s
// Test that we get optimal nops in text
.text
diff --git a/test/MC/MachO/x86_32-optimal_nop.s b/test/MC/MachO/x86_32-optimal_nop.s
index 01d8a1f6eb2a..1bc9eff337c4 100644
--- a/test/MC/MachO/x86_32-optimal_nop.s
+++ b/test/MC/MachO/x86_32-optimal_nop.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin9 -mcpu=pentiumpro %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s
# 1 byte nop test
.align 4, 0 # start with 16 byte alignment filled with zeros
diff --git a/test/MC/Mips/eva/invalid.s b/test/MC/Mips/eva/invalid.s
index 5c4cc3ddf631..1f7c2e6ac8c6 100644
--- a/test/MC/Mips/eva/invalid.s
+++ b/test/MC/Mips/eva/invalid.s
@@ -1,35 +1,36 @@
# Instructions that are invalid
#
# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips32r2 \
-# RUN: -mattr==eva 2>%t1
+# RUN: -mattr=+eva 2>%t1
# RUN: FileCheck %s < %t1
.set noat
- cachee -1, 255($7) # CHECK: :[[@LINE]]:12: error: invalid operand for instruction
- cachee 32, 255($7) # CHECK: :[[@LINE]]:12: error: invalid operand for instruction
- prefe -1, 255($7) # CHECK: :[[@LINE]]:11: error: invalid operand for instruction
- prefe 32, 255($7) # CHECK: :[[@LINE]]:11: error: invalid operand for instruction
+ cachee -1, 255($7) # CHECK: :[[@LINE]]:12: error: expected 5-bit unsigned immediate
+ cachee 32, 255($7) # CHECK: :[[@LINE]]:12: error: expected 5-bit unsigned immediate
+ prefe -1, 255($7) # CHECK: :[[@LINE]]:11: error: expected 5-bit unsigned immediate
+ prefe 32, 255($7) # CHECK: :[[@LINE]]:11: error: expected 5-bit unsigned immediate
lle $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- lle $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lle $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lle $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ lle $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lle $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lle $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
lwe $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- lwe $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lwe $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- lwe $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ lwe $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lwe $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ lwe $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
sbe $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- sbe $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sbe $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sbe $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ sbe $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sbe $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sbe $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
sce $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- sce $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sce $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- sce $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ sce $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sce $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ sce $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
she $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- she $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- she $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- she $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ she $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ she $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ she $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
swe $33, 8($4) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction
- swe $5, 8($34) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- swe $5, 512($4) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
- swe $5, -513($4) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
+ swe $5, 8($34) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ swe $5, 512($4) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+ swe $5, -513($4) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset
+
diff --git a/test/MC/WebAssembly/weak-alias.ll b/test/MC/WebAssembly/weak-alias.ll
index 5f5d9b447c71..29eeac02da66 100644
--- a/test/MC/WebAssembly/weak-alias.ll
+++ b/test/MC/WebAssembly/weak-alias.ll
@@ -8,21 +8,41 @@
@bar = global i32 7, align 8
@bar_alias = weak hidden alias i32, i32* @bar
-@bar_alias_address = global i32* @bar_alias, align 8
-
@foo_alias = weak hidden alias i32 (), i32 ()* @foo
+@direct_address = global i32()* @foo, align 8
+@alias_address = global i32()* @foo_alias, align 8
+
+define hidden i32 @foo() #0 {
+entry:
+ ret i32 0
+}
+
+define hidden i32 @call_direct() #0 {
+entry:
+ %call = call i32 @foo()
+ ret i32 %call
+}
+
define hidden i32 @call_alias() #0 {
entry:
%call = call i32 @foo_alias()
ret i32 %call
}
-define hidden i32 @foo() #0 {
+define hidden i32 @call_direct_ptr() #0 {
entry:
- ret i32 0
+ %0 = load i32 ()*, i32 ()** @direct_address, align 8
+ %call = call i32 %0()
+ ret i32 %call
}
+define hidden i32 @call_alias_ptr() #0 {
+entry:
+ %0 = load i32 ()*, i32 ()** @alias_address, align 8
+ %call = call i32 %0()
+ ret i32 %call
+}
; CHECK: - Type: TYPE
; CHECK-NEXT: Signatures:
@@ -42,47 +62,112 @@ entry:
; CHECK-NEXT: Table:
; CHECK-NEXT: ElemType: ANYFUNC
; CHECK-NEXT: Limits:
-; CHECK-NEXT: Initial: 0x00000000
+; CHECK-NEXT: Initial: 0x00000002
+; CHECK-NEXT: - Module: env
+; CHECK-NEXT: Field: foo_alias
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: SigIndex: 0
+; CHECK-NEXT: - Module: env
+; CHECK-NEXT: Field: bar_alias
+; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: GlobalType: I32
+; CHECK-NEXT: GlobalMutable: false
; CHECK-NEXT: - Type: FUNCTION
-; CHECK-NEXT: FunctionTypes: [ 0, 0 ]
+; CHECK-NEXT: FunctionTypes: [ 0, 0, 0, 0, 0 ]
; CHECK-NEXT: - Type: GLOBAL
; CHECK-NEXT: Globals:
; CHECK-NEXT: - Type: I32
; CHECK-NEXT: Mutable: false
; CHECK-NEXT: InitExpr:
; CHECK-NEXT: Opcode: I32_CONST
-; CHECK-NEXT: Value: 0
+; CHECK-NEXT: Value: 8
; CHECK-NEXT: - Type: I32
; CHECK-NEXT: Mutable: false
; CHECK-NEXT: InitExpr:
; CHECK-NEXT: Opcode: I32_CONST
-; CHECK-NEXT: Value: 8
+; CHECK-NEXT: Value: 16
+; CHECK-NEXT: - Type: I32
+; CHECK-NEXT: Mutable: false
+; CHECK-NEXT: InitExpr:
+; CHECK-NEXT: Opcode: I32_CONST
+; CHECK-NEXT: Value: 0
; CHECK-NEXT: - Type: EXPORT
; CHECK-NEXT: Exports:
+; CHECK-NEXT: - Name: foo
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: - Name: call_direct
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 2
; CHECK-NEXT: - Name: call_alias
; CHECK-NEXT: Kind: FUNCTION
-; CHECK-NEXT: Index: 0
-; CHECK-NEXT: - Name: foo
+; CHECK-NEXT: Index: 3
+; CHECK-NEXT: - Name: call_direct_ptr
; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 4
+; CHECK-NEXT: - Name: direct_address
+; CHECK-NEXT: Kind: GLOBAL
; CHECK-NEXT: Index: 1
-; CHECK-NEXT: - Name: bar
+; CHECK-NEXT: - Name: call_alias_ptr
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 5
+; CHECK-NEXT: - Name: alias_address
; CHECK-NEXT: Kind: GLOBAL
-; CHECK-NEXT: Index: 0
-; CHECK-NEXT: - Name: bar_alias_address
+; CHECK-NEXT: Index: 2
+; CHECK-NEXT: - Name: bar
; CHECK-NEXT: Kind: GLOBAL
-; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Index: 3
; CHECK-NEXT: - Name: foo_alias
; CHECK-NEXT: Kind: FUNCTION
; CHECK-NEXT: Index: 1
; CHECK-NEXT: - Name: bar_alias
; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: Index: 3
+; CHECK-NEXT: - Type: ELEM
+; CHECK-NEXT: Segments:
+; CHECK-NEXT: - Offset:
+; CHECK-NEXT: Opcode: I32_CONST
+; CHECK-NEXT: Value: 0
+; CHECK-NEXT: Functions: [ 1, 0 ]
+; CHECK-NEXT: - Type: CODE
+; CHECK-NEXT: Relocations:
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Offset: 0x00000009
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB
; CHECK-NEXT: Index: 0
-
-; CHECK: - Type: DATA
+; CHECK-NEXT: Offset: 0x00000012
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_LEB
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Offset: 0x0000001E
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TYPE_INDEX_LEB
+; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Offset: 0x00000024
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_LEB
+; CHECK-NEXT: Index: 2
+; CHECK-NEXT: Offset: 0x00000031
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TYPE_INDEX_LEB
+; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Offset: 0x00000037
+; CHECK-NEXT: Functions:
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 41000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 1081808080000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 1080808080000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 410028028880808000118080808000000B
+; CHECK-NEXT: - Locals:
+; CHECK-NEXT: Body: 410028029080808000118080808000000B
+; CHECK-NEXT: - Type: DATA
; CHECK-NEXT: Relocations:
-; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_I32
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32
; CHECK-NEXT: Index: 0
; CHECK-NEXT: Offset: 0x0000000F
+; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32
+; CHECK-NEXT: Index: 1
+; CHECK-NEXT: Offset: 0x00000018
; CHECK-NEXT: Segments:
; CHECK-NEXT: - SectionOffset: 6
; CHECK-NEXT: MemoryIndex: 0
@@ -101,38 +186,64 @@ entry:
; CHECK-NEXT: Name: name
; CHECK-NEXT: FunctionNames:
; CHECK-NEXT: - Index: 0
-; CHECK-NEXT: Name: call_alias
+; CHECK-NEXT: Name: foo_alias
; CHECK-NEXT: - Index: 1
; CHECK-NEXT: Name: foo
+; CHECK-NEXT: - Index: 2
+; CHECK-NEXT: Name: call_direct
+; CHECK-NEXT: - Index: 3
+; CHECK-NEXT: Name: call_alias
+; CHECK-NEXT: - Index: 4
+; CHECK-NEXT: Name: call_direct_ptr
+; CHECK-NEXT: - Index: 5
+; CHECK-NEXT: Name: call_alias_ptr
; CHECK-NEXT: - Type: CUSTOM
; CHECK-NEXT: Name: linking
-; CHECK-NEXT: DataSize: 12
+; CHECK-NEXT: DataSize: 20
; CHECK-NEXT: SymbolInfo:
-; CHECK-NEXT: - Name: call_alias
-; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
-; CHECK-NEXT: - Name: foo
-; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
; CHECK-NEXT: - Name: foo_alias
; CHECK-NEXT: Flags: [ BINDING_WEAK, VISIBILITY_HIDDEN ]
; CHECK-NEXT: - Name: bar_alias
; CHECK-NEXT: Flags: [ BINDING_WEAK, VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: foo
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_direct
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_alias
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_direct_ptr
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
+; CHECK-NEXT: - Name: call_alias_ptr
+; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ]
; CHECK-NEXT: SegmentInfo:
; CHECK-NEXT: - Index: 0
; CHECK-NEXT: Name: .data.bar
; CHECK-NEXT: Alignment: 8
; CHECK-NEXT: Flags: [ ]
; CHECK-NEXT: - Index: 1
-; CHECK-NEXT: Name: .data.bar_alias_address
+; CHECK-NEXT: Name: .data.direct_address
+; CHECK-NEXT: Alignment: 8
+; CHECK-NEXT: Flags: [ ]
+; CHECK-NEXT: - Index: 2
+; CHECK-NEXT: Name: .data.alias_address
; CHECK-NEXT: Alignment: 8
; CHECK-NEXT: Flags: [ ]
; CHECK-NEXT: ...
; CHECK-SYMS: SYMBOL TABLE:
-; CHECK-SYMS-NEXT: 00000000 g F name call_alias
+; CHECK-SYMS-NEXT: 00000000 g F name foo_alias
; CHECK-SYMS-NEXT: 00000001 g F name foo
-; CHECK-SYMS-NEXT: 00000000 g F EXPORT .hidden call_alias
-; CHECK-SYMS-NEXT: 00000001 g F EXPORT .hidden foo
-; CHECK-SYMS-NEXT: 00000000 g EXPORT bar
-; CHECK-SYMS-NEXT: 00000008 g EXPORT bar_alias_address
+; CHECK-SYMS-NEXT: 00000002 g F name call_direct
+; CHECK-SYMS-NEXT: 00000003 g F name call_alias
+; CHECK-SYMS-NEXT: 00000004 g F name call_direct_ptr
+; CHECK-SYMS-NEXT: 00000005 g F name call_alias_ptr
; CHECK-SYMS-NEXT: 00000001 gw F EXPORT .hidden foo_alias
; CHECK-SYMS-NEXT: 00000000 gw EXPORT .hidden bar_alias
+; CHECK-SYMS-NEXT: 00000001 g F EXPORT .hidden foo
+; CHECK-SYMS-NEXT: 00000002 g F EXPORT .hidden call_direct
+; CHECK-SYMS-NEXT: 00000003 g F EXPORT .hidden call_alias
+; CHECK-SYMS-NEXT: 00000004 g F EXPORT .hidden call_direct_ptr
+; CHECK-SYMS-NEXT: 00000008 g EXPORT direct_address
+; CHECK-SYMS-NEXT: 00000005 g F EXPORT .hidden call_alias_ptr
+; CHECK-SYMS-NEXT: 00000010 g EXPORT alias_address
+; CHECK-SYMS-NEXT: 00000000 g EXPORT bar
diff --git a/test/MC/X86/3DNow.s b/test/MC/X86/3DNow.s
index 871857b155d0..e66e39b547ae 100644
--- a/test/MC/X86/3DNow.s
+++ b/test/MC/X86/3DNow.s
@@ -72,8 +72,10 @@ femms
// CHECK: prefetch (%rax) # encoding: [0x0f,0x0d,0x00]
// CHECK: prefetchw (%rax) # encoding: [0x0f,0x0d,0x08]
+// CHECK: prefetchwt1 (%rax) # encoding: [0x0f,0x0d,0x10]
prefetch (%rax)
prefetchw (%rax)
+prefetchwt1 (%rax)
// CHECK: pf2iw %mm2, %mm1 # encoding: [0x0f,0x0f,0xca,0x1c]
diff --git a/test/MC/X86/AlignedBundling/different-sections.s b/test/MC/X86/AlignedBundling/different-sections.s
index e12153210042..0af4a8f133e6 100644
--- a/test/MC/X86/AlignedBundling/different-sections.s
+++ b/test/MC/X86/AlignedBundling/different-sections.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test two different executable sections with bundling.
diff --git a/test/MC/X86/AlignedBundling/long-nop-pad.s b/test/MC/X86/AlignedBundling/long-nop-pad.s
index 36e4f4b553fb..bcc319456d35 100644
--- a/test/MC/X86/AlignedBundling/long-nop-pad.s
+++ b/test/MC/X86/AlignedBundling/long-nop-pad.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test that long nops are generated for padding where possible.
diff --git a/test/MC/X86/AlignedBundling/misaligned-bundle-group.s b/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
index 04b3374716bb..edd933ed720d 100644
--- a/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
+++ b/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
diff --git a/test/MC/X86/AlignedBundling/misaligned-bundle.s b/test/MC/X86/AlignedBundling/misaligned-bundle.s
index 08d616109909..676b667cada5 100644
--- a/test/MC/X86/AlignedBundling/misaligned-bundle.s
+++ b/test/MC/X86/AlignedBundling/misaligned-bundle.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - \
# RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
diff --git a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
index 158cde8cd450..8605b7c0137d 100644
--- a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
+++ b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test some variations of padding to the end of a bundle.
diff --git a/test/MC/X86/AlignedBundling/pad-bundle-groups.s b/test/MC/X86/AlignedBundling/pad-bundle-groups.s
index 7a9e30c053ec..5993d73dd6dd 100644
--- a/test/MC/X86/AlignedBundling/pad-bundle-groups.s
+++ b/test/MC/X86/AlignedBundling/pad-bundle-groups.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
# Test some variations of padding for bundle-locked groups.
diff --git a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
index d07619063f00..036249b906b4 100644
--- a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
+++ b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble - | FileCheck %s
# Test that instructions inside bundle-locked groups are relaxed even if their
diff --git a/test/MC/X86/AlignedBundling/single-inst-bundling.s b/test/MC/X86/AlignedBundling/single-inst-bundling.s
index a7df2c96a8eb..cb0ad8adba39 100644
--- a/test/MC/X86/AlignedBundling/single-inst-bundling.s
+++ b/test/MC/X86/AlignedBundling/single-inst-bundling.s
@@ -1,6 +1,6 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s
# Test simple NOP insertion for single instructions.
diff --git a/test/MC/X86/CLFLUSHOPT-32.s b/test/MC/X86/CLFLUSHOPT-32.s
new file mode 100644
index 000000000000..e3df46ced7b0
--- /dev/null
+++ b/test/MC/X86/CLFLUSHOPT-32.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflushopt -485498096(%edx,%eax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0xbc,0x82,0x10,0xe3,0x0f,0xe3]
+clflushopt -485498096(%edx,%eax,4)
+
+// CHECK: clflushopt 485498096(%edx,%eax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0xbc,0x82,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096(%edx,%eax,4)
+
+// CHECK: clflushopt 485498096(%edx)
+// CHECK: encoding: [0x66,0x0f,0xae,0xba,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096(%edx)
+
+// CHECK: clflushopt 485498096
+// CHECK: encoding: [0x66,0x0f,0xae,0x3d,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096
+
+// CHECK: clflushopt 64(%edx,%eax)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x02,0x40]
+clflushopt 64(%edx,%eax)
+
+// CHECK: clflushopt (%edx)
+// CHECK: encoding: [0x66,0x0f,0xae,0x3a]
+clflushopt (%edx)
+
diff --git a/test/MC/X86/CLFLUSHOPT-64.s b/test/MC/X86/CLFLUSHOPT-64.s
new file mode 100644
index 000000000000..cdecced6fa2e
--- /dev/null
+++ b/test/MC/X86/CLFLUSHOPT-64.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflushopt 485498096
+// CHECK: encoding: [0x66,0x0f,0xae,0x3c,0x25,0xf0,0x1c,0xf0,0x1c]
+clflushopt 485498096
+
+// CHECK: clflushopt 64(%rdx)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7a,0x40]
+clflushopt 64(%rdx)
+
+// CHECK: clflushopt 64(%rdx,%rax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x82,0x40]
+clflushopt 64(%rdx,%rax,4)
+
+// CHECK: clflushopt -64(%rdx,%rax,4)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x82,0xc0]
+clflushopt -64(%rdx,%rax,4)
+
+// CHECK: clflushopt 64(%rdx,%rax)
+// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x02,0x40]
+clflushopt 64(%rdx,%rax)
+
+// CHECK: clflushopt (%rdx)
+// CHECK: encoding: [0x66,0x0f,0xae,0x3a]
+clflushopt (%rdx)
+
diff --git a/test/MC/X86/CLFSH-32.s b/test/MC/X86/CLFSH-32.s
new file mode 100644
index 000000000000..898569ec2df6
--- /dev/null
+++ b/test/MC/X86/CLFSH-32.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflush -485498096(%edx,%eax,4)
+// CHECK: encoding: [0x0f,0xae,0xbc,0x82,0x10,0xe3,0x0f,0xe3]
+clflush -485498096(%edx,%eax,4)
+
+// CHECK: clflush 485498096(%edx,%eax,4)
+// CHECK: encoding: [0x0f,0xae,0xbc,0x82,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096(%edx,%eax,4)
+
+// CHECK: clflush 485498096(%edx)
+// CHECK: encoding: [0x0f,0xae,0xba,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096(%edx)
+
+// CHECK: clflush 485498096
+// CHECK: encoding: [0x0f,0xae,0x3d,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096
+
+// CHECK: clflush 64(%edx,%eax)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x02,0x40]
+clflush 64(%edx,%eax)
+
+// CHECK: clflush (%edx)
+// CHECK: encoding: [0x0f,0xae,0x3a]
+clflush (%edx)
+
diff --git a/test/MC/X86/CLFSH-64.s b/test/MC/X86/CLFSH-64.s
new file mode 100644
index 000000000000..f2c2ae51c81b
--- /dev/null
+++ b/test/MC/X86/CLFSH-64.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: clflush 485498096
+// CHECK: encoding: [0x0f,0xae,0x3c,0x25,0xf0,0x1c,0xf0,0x1c]
+clflush 485498096
+
+// CHECK: clflush 64(%rdx)
+// CHECK: encoding: [0x0f,0xae,0x7a,0x40]
+clflush 64(%rdx)
+
+// CHECK: clflush 64(%rdx,%rax,4)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x82,0x40]
+clflush 64(%rdx,%rax,4)
+
+// CHECK: clflush -64(%rdx,%rax,4)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x82,0xc0]
+clflush -64(%rdx,%rax,4)
+
+// CHECK: clflush 64(%rdx,%rax)
+// CHECK: encoding: [0x0f,0xae,0x7c,0x02,0x40]
+clflush 64(%rdx,%rax)
+
+// CHECK: clflush (%rdx)
+// CHECK: encoding: [0x0f,0xae,0x3a]
+clflush (%rdx)
+
diff --git a/test/MC/X86/x86_long_nop.s b/test/MC/X86/x86_long_nop.s
index fa8525b44c22..7fa553692ea0 100644
--- a/test/MC/X86/x86_long_nop.s
+++ b/test/MC/X86/x86_long_nop.s
@@ -1,7 +1,7 @@
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=silvermont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=NOP1 %s
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index f9fe51f84380..bc0b509622c4 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -63,11 +63,14 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-NEXT: typedef ComplexRendererFns(MyTargetInstructionSelector::*ComplexMatcherMemFn)(MachineOperand &) const;
// CHECK-NEXT: const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> MatcherInfo;
// CHECK-NEXT: static MyTargetInstructionSelector::ComplexMatcherMemFn ComplexPredicateFns[];
+// CHECK-NEXT: bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const override;
+// CHECK-NEXT: bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) const override;
+// CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const override;
// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL
// CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT
// CHECK-NEXT: , State(2),
-// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, I64ImmPredicateFns, APIntImmPredicateFns, APFloatImmPredicateFns, ComplexPredicateFns})
+// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, ComplexPredicateFns})
// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT
// CHECK-LABEL: enum SubtargetFeatureBits : uint8_t {
@@ -127,31 +130,49 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; }
// CHECK-NEXT: enum {
// CHECK-NEXT: GIPFP_I64_Predicate_simm8 = GIPFP_I64_Invalid + 1,
// CHECK-NEXT: };
-// CHECK-NEXT: static bool Predicate_simm8(int64_t Imm) { return isInt<8>(Imm); }
-// CHECK-NEXT: static InstructionSelector::I64ImmediatePredicateFn I64ImmPredicateFns[] = {
-// CHECK-NEXT: nullptr,
-// CHECK-NEXT: Predicate_simm8,
-// CHECK-NEXT: };
+// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const {
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GIPFP_I64_Predicate_simm8: {
+// CHECK-NEXT: return isInt<8>(Imm);
+// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
// CHECK-LABEL: // PatFrag predicates.
// CHECK-NEXT: enum {
// CHECK-NEXT: GIPFP_APFloat_Predicate_fpimmz = GIPFP_APFloat_Invalid + 1,
// CHECK-NEXT: };
-// CHECK-NEXT: static bool Predicate_fpimmz(const APFloat & Imm) { return Imm->isExactlyValue(0.0); }
-// CHECK-NEXT: static InstructionSelector::APFloatImmediatePredicateFn APFloatImmPredicateFns[] = {
-// CHECK-NEXT: nullptr,
-// CHECK-NEXT: Predicate_fpimmz,
-// CHECK-NEXT: };
+// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_APFloat(unsigned PredicateID, const APFloat & Imm) const {
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GIPFP_APFloat_Predicate_fpimmz: {
+// CHECK-NEXT: return Imm->isExactlyValue(0.0);
+// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
// CHECK-LABEL: // PatFrag predicates.
// CHECK-NEXT: enum {
// CHECK-NEXT: GIPFP_APInt_Predicate_simm9 = GIPFP_APInt_Invalid + 1,
// CHECK-NEXT: };
-// CHECK-NEXT: static bool Predicate_simm9(const APInt & Imm) { return isInt<9>(Imm->getSExtValue()); }
-// CHECK-NEXT: static InstructionSelector::APIntImmediatePredicateFn APIntImmPredicateFns[] = {
-// CHECK-NEXT: nullptr,
-// CHECK-NEXT: Predicate_simm9,
-// CHECK-NEXT: };
+// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_APInt(unsigned PredicateID, const APInt & Imm) const {
+// CHECK-NEXT: switch (PredicateID) {
+// CHECK-NEXT: case GIPFP_APInt_Predicate_simm9: {
+// CHECK-NEXT: return isInt<9>(Imm->getSExtValue());
+// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm_unreachable("Unknown predicate");
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }
// CHECK-LABEL: MyTargetInstructionSelector::ComplexMatcherMemFn
// CHECK-NEXT: MyTargetInstructionSelector::ComplexPredicateFns[] = {
diff --git a/test/TableGen/intrinsic-long-name.td b/test/TableGen/intrinsic-long-name.td
index 24ed89ac4acf..9bbfe9829a65 100644
--- a/test/TableGen/intrinsic-long-name.td
+++ b/test/TableGen/intrinsic-long-name.td
@@ -2,6 +2,7 @@
// XFAIL: vg_leak
class IntrinsicProperty;
+class SDNodeProperty;
class ValueType<int size, int value> {
string Namespace = "MVT";
@@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> {
list<LLVMType> RetTypes = [];
list<LLVMType> ParamTypes = param_types;
list<IntrinsicProperty> IntrProperties = [];
+ list<SDNodeProperty> Properties = [];
}
def iAny : ValueType<0, 253>;
diff --git a/test/TableGen/intrinsic-struct.td b/test/TableGen/intrinsic-struct.td
index 93737b14db2a..1f1a8c2c8220 100644
--- a/test/TableGen/intrinsic-struct.td
+++ b/test/TableGen/intrinsic-struct.td
@@ -2,6 +2,7 @@
// XFAIL: vg_leak
class IntrinsicProperty;
+class SDNodeProperty;
class ValueType<int size, int value> {
string Namespace = "MVT";
@@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> ret_types = []> {
list<LLVMType> RetTypes = ret_types;
list<LLVMType> ParamTypes = [];
list<IntrinsicProperty> IntrProperties = [];
+ list<SDNodeProperty> Properties = [];
}
def iAny : ValueType<0, 253>;
diff --git a/test/TableGen/intrinsic-varargs.td b/test/TableGen/intrinsic-varargs.td
index 4b2cc5ae02f7..484364779012 100644
--- a/test/TableGen/intrinsic-varargs.td
+++ b/test/TableGen/intrinsic-varargs.td
@@ -2,6 +2,7 @@
// XFAIL: vg_leak
class IntrinsicProperty;
+class SDNodeProperty;
class ValueType<int size, int value> {
string Namespace = "MVT";
@@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> {
list<LLVMType> RetTypes = [];
list<LLVMType> ParamTypes = param_types;
list<IntrinsicProperty> IntrProperties = [];
+ list<SDNodeProperty> Properties = [];
}
// isVoid needs to match the definition in ValueTypes.td
diff --git a/test/ThinLTO/X86/cache.ll b/test/ThinLTO/X86/cache.ll
index ea5c2f98d876..75466442d786 100644
--- a/test/ThinLTO/X86/cache.ll
+++ b/test/ThinLTO/X86/cache.ll
@@ -5,7 +5,7 @@
; Verify that enabling caching is ignoring module without hash
; RUN: rm -Rf %t.cache && mkdir %t.cache
-; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
; RUN: ls %t.cache/llvmcache.timestamp
; RUN: ls %t.cache | count 1
@@ -27,7 +27,7 @@
; files matching the pattern "llvmcache-*".
; RUN: rm -Rf %t.cache && mkdir %t.cache
; RUN: touch -t 197001011200 %t.cache/llvmcache-foo %t.cache/foo
-; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
; RUN: ls %t.cache | count 4
; RUN: ls %t.cache/llvmcache.timestamp
; RUN: ls %t.cache/foo
@@ -36,13 +36,29 @@
; Verify that enabling caching is working with llvm-lto2
; RUN: rm -Rf %t.cache
-; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \
+; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \
; RUN: -r=%t2.bc,_main,plx \
; RUN: -r=%t2.bc,_globalfunc,lx \
; RUN: -r=%t.bc,_globalfunc,plx
; RUN: ls %t.cache | count 2
; RUN: ls %t.cache/llvmcache-* | count 2
+; Verify that caches with a timestamp older than the pruning interval
+; will be pruned
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: touch -t 197001011200 %t.cache/llvmcache-foo
+; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache
+; RUN: not ls %t.cache/llvmcache-foo
+
+; Verify that specifying a negative number for the pruning interval
+; effectively disables the pruning
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: touch -t 197001011200 %t.cache/llvmcache-foo
+; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp
+; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-pruning-interval -1
+; RUN: ls %t.cache/llvmcache-foo
+
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
new file mode 100644
index 000000000000..e10b04a850af
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll
@@ -0,0 +1,139 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
+
+; CHECK-LABEL: @test_simple
+; CHECK-LABEL: Header:
+; CHECK-NEXT: br i1 undef, label %Tail.predBB1.split
+; CHECK-LABEL: TBB:
+; CHECK: br i1 %cmp, label %Tail.predBB2.split
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_simple(i32* %a, i32 %v, i32 %p) {
+Header:
+ br i1 undef, label %Tail, label %End
+
+TBB:
+ %cmp = icmp eq i32* %a, null
+ br i1 %cmp, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+; CHECK-LABEL: @test_eq_eq_eq_untaken
+; CHECK-LABEL: Header:
+; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB1, label %Tail
+
+TBB1:
+ %cmp1 = icmp eq i32 %v, 1
+ br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+ %cmp2 = icmp eq i32 %p, 99
+ br i1 %cmp2, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+; CHECK-LABEL: @test_eq_ne_eq_untaken
+; CHECK-LABEL: Header:
+; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p)
+; CHECK-LABEL: Tail.predBB2.split:
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB1, label %Tail
+
+TBB1:
+ %cmp1 = icmp ne i32 %v, 1
+ br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+ %cmp2 = icmp eq i32 %p, 99
+ br i1 %cmp2, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+; CHECK-LABEL: @test_header_header2_tbb
+; CHECK: Header2:
+; CHECK: br i1 %tobool2, label %Tail.predBB1.split, label %TBB1
+; CHECK-LABEL: TBB2:
+; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End
+; CHECK-LABEL: Tail.predBB1.split:
+; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10)
+; CHECK-LABEL: Tail.predBB2.split:
+; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently
+; only supports recording conditions along a single predecessor path.
+; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99)
+; CHECK-LABEL: Tail
+; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+; CHECK: ret i32 %[[MERGED]]
+define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) {
+Header:
+ %tobool1 = icmp eq i32* %a, null
+ br i1 %tobool1, label %TBB1, label %Header2
+
+Header2:
+ %tobool2 = icmp eq i32 %p, 10
+ br i1 %tobool2, label %Tail, label %TBB1
+
+TBB1:
+ %cmp1 = icmp eq i32 %v, 1
+ br i1 %cmp1, label %TBB2, label %End
+
+TBB2:
+ %cmp2 = icmp eq i32 %p, 99
+ br i1 %cmp2, label %Tail, label %End
+
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+
+End:
+ ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+ ret i32 10
+}
diff --git a/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
new file mode 100644
index 000000000000..ca41bd6fc5e1
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s
+
+define i32 @callee(i32*, i32, i32) {
+ ret i32 10
+}
+
+; CHECK-LABEL: @test_preds_equal
+; CHECK-NOT: split
+; CHECK: br i1 %cmp, label %Tail, label %Tail
+define i32 @test_preds_equal(i32* %a, i32 %v, i32 %p) {
+TBB:
+ %cmp = icmp eq i32* %a, null
+ br i1 %cmp, label %Tail, label %Tail
+Tail:
+ %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+ ret i32 %r
+}
diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll
index 4f3144e7fc73..30598ba7afbe 100644
--- a/test/Transforms/CodeGenPrepare/section.ll
+++ b/test/Transforms/CodeGenPrepare/section.ll
@@ -4,33 +4,59 @@ target triple = "x86_64-pc-linux-gnu"
; This tests that hot/cold functions get correct section prefix assigned
-; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; CHECK: hot_func1{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
; The entry is hot
-define void @hot_func() !prof !15 {
+define void @hot_func1() !prof !15 {
ret void
}
-; For instrumentation based PGO, we should only look at entry counts,
+; CHECK: hot_func2{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; Entry is cold but inner block is hot
+define void @hot_func2(i32 %n) !prof !16 {
+entry:
+ %n.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %n, i32* %n.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32, i32* %i, align 4
+ %1 = load i32, i32* %n.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end, !prof !19
+
+for.body:
+ %2 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %2, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+; For instrumentation based PGO, we should only look at block counts,
; not call site VP metadata (which can exist on value profiled memcpy,
; or possibly left behind after static analysis based devirtualization).
; CHECK: cold_func1{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
define void @cold_func1() !prof !16 {
- call void @hot_func(), !prof !17
- call void @hot_func(), !prof !17
+ call void @hot_func1(), !prof !17
+ call void @hot_func1(), !prof !17
ret void
}
-; CHECK: cold_func2{{.*}}!section_prefix
+; CHECK: cold_func2{{.*}}!section_prefix ![[COLD_ID]]
define void @cold_func2() !prof !16 {
- call void @hot_func(), !prof !17
- call void @hot_func(), !prof !18
- call void @hot_func(), !prof !18
+ call void @hot_func1(), !prof !17
+ call void @hot_func1(), !prof !18
+ call void @hot_func1(), !prof !18
ret void
}
; CHECK: cold_func3{{.*}}!section_prefix ![[COLD_ID]]
define void @cold_func3() !prof !16 {
- call void @hot_func(), !prof !18
+ call void @hot_func1(), !prof !18
ret void
}
@@ -55,3 +81,4 @@ define void @cold_func3() !prof !16 {
!16 = !{!"function_entry_count", i64 1}
!17 = !{!"branch_weights", i32 80}
!18 = !{!"branch_weights", i32 1}
+!19 = !{!"branch_weights", i32 1000, i32 1}
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
index 7c05fda6cb8f..5cb4e0359970 100644
--- a/test/Transforms/GVN/tbaa.ll
+++ b/test/Transforms/GVN/tbaa.ll
@@ -1,7 +1,7 @@
; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
define i32 @test1(i8* %p, i8* %q) {
-; CHECK: @test1(i8* %p, i8* %q)
+; CHECK-LABEL: @test1(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) {
}
define i32 @test2(i8* %p, i8* %q) {
-; CHECK: @test2(i8* %p, i8* %q)
+; CHECK-LABEL: @test2(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
@@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) {
}
define i32 @test3(i8* %p, i8* %q) {
-; CHECK: @test3(i8* %p, i8* %q)
+; CHECK-LABEL: @test3(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !3
@@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) {
}
define i32 @test4(i8* %p, i8* %q) {
-; CHECK: @test4(i8* %p, i8* %q)
+; CHECK-LABEL: @test4(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !1
@@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) {
}
define i32 @test5(i8* %p, i8* %q) {
-; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !1
@@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) {
}
define i32 @test6(i8* %p, i8* %q) {
-; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !3
@@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) {
}
define i32 @test7(i8* %p, i8* %q) {
-; CHECK: @test7(i8* %p, i8* %q)
+; CHECK-LABEL: @test7(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) {
ret i32 %c
}
-
-
define i32 @test8(i32* %p, i32* %q) {
-; CHECK-LABEL: test8
+; CHECK-LABEL: @test8
; CHECK-NEXT: store i32 15, i32* %p
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) {
%c = sub i32 %a, %b
ret i32 %c
}
+
define i32 @test9(i32* %p, i32* %q) {
-; CHECK-LABEL: test9
+; CHECK-LABEL: @test9
; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) {
ret i32 %c
}
+define i32 @test10(i8* %p, i8* %q) {
+; If one access encloses the other, then the merged access is the enclosed one
+; and not just the common final access type.
+; CHECK-LABEL: @test10
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]]
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !15 ; TAG_X_i
+ %b = call i32 @foo(i8* %p), !tbaa !19 ; TAG_Y_x_i
+ %c = add i32 %a, %b
+ ret i32 %c
+}
declare void @clobber()
declare i32 @foo(i8*) readonly
-; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
!0 = !{!5, !5, i64 0}
!1 = !{!6, !6, i64 0}
!2 = !{!"tbaa root"}
@@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly
!8 = !{!"another root"}
!11 = !{!"scalar type", !8}
+; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
+; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
+; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
+!15 = !{!16, !17, i64 0} ; TAG_X_i
+!16 = !{!"struct X", !17, i64 0} ; struct X { int i; };
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"char", !2, i64 0}
-;; A TBAA structure who's only point is to have a constant location
+!19 = !{!20, !17, i64 0} ; TAG_Y_x_i
+!20 = !{!"struct Y", !16, i64 0} ; struct Y { struct X x; };
+
+; A TBAA structure whose only point is to have a constant location.
!9 = !{!"yet another root"}
!10 = !{!"node", !9, i64 1}
-
diff --git a/test/Transforms/Inline/AArch64/binop.ll b/test/Transforms/Inline/AArch64/binop.ll
new file mode 100644
index 000000000000..051528991e46
--- /dev/null
+++ b/test/Transforms/Inline/AArch64/binop.ll
@@ -0,0 +1,291 @@
+; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -o - < %s -inline-threshold=0 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+declare void @pad()
+@glbl = external global i32
+
+define i32 @outer_add1(i32 %a) {
+; CHECK-LABEL: @outer_add1(
+; CHECK-NOT: call i32 @add
+ %C = call i32 @add(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @outer_add2(i32 %a) {
+; CHECK-LABEL: @outer_add2(
+; CHECK-NOT: call i32 @add
+ %C = call i32 @add(i32 0, i32 %a)
+ ret i32 %C
+}
+
+define i32 @add(i32 %a, i32 %b) {
+ %add = add i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %add
+}
+
+
+
+define i32 @outer_sub1(i32 %a) {
+; CHECK-LABEL: @outer_sub1(
+; CHECK-NOT: call i32 @sub1
+ %C = call i32 @sub1(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @sub1(i32 %a, i32 %b) {
+ %sub = sub i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %sub
+}
+
+
+define i32 @outer_sub2(i32 %a) {
+; CHECK-LABEL: @outer_sub2(
+; CHECK-NOT: call i32 @sub2
+ %C = call i32 @sub2(i32 %a)
+ ret i32 %C
+}
+
+define i32 @sub2(i32 %a) {
+ %sub = sub i32 %a, %a
+ call void @pad()
+ ret i32 %sub
+}
+
+
+
+define i32 @outer_mul1(i32 %a) {
+; CHECK-LABEL: @outer_mul1(
+; CHECK-NOT: call i32 @mul
+ %C = call i32 @mul(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @outer_mul2(i32 %a) {
+; CHECK-LABEL: @outer_mul2(
+; CHECK-NOT: call i32 @mul
+ %C = call i32 @mul(i32 %a, i32 1)
+ ret i32 %C
+}
+
+define i32 @mul(i32 %a, i32 %b) {
+ %mul = mul i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %mul
+}
+
+
+
+define i32 @outer_div1(i32 %a) {
+; CHECK-LABEL: @outer_div1(
+; CHECK-NOT: call i32 @div1
+ %C = call i32 @div1(i32 0, i32 %a)
+ ret i32 %C
+}
+
+define i32 @outer_div2(i32 %a) {
+; CHECK-LABEL: @outer_div2(
+; CHECK-NOT: call i32 @div1
+ %C = call i32 @div1(i32 %a, i32 1)
+ ret i32 %C
+}
+
+define i32 @div1(i32 %a, i32 %b) {
+ %div = sdiv i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %div
+}
+
+
+define i32 @outer_div3(i32 %a) {
+; CHECK-LABEL: @outer_div3(
+; CHECK-NOT: call i32 @div
+ %C = call i32 @div2(i32 %a)
+ ret i32 %C
+}
+
+define i32 @div2(i32 %a) {
+ %div = sdiv i32 %a, %a
+ call void @pad()
+ ret i32 %div
+}
+
+
+
+define i32 @outer_rem1(i32 %a) {
+; CHECK-LABEL: @outer_rem1(
+; CHECK-NOT: call i32 @rem
+ %C = call i32 @rem1(i32 0, i32 %a)
+ ret i32 %C
+}
+
+define i32 @outer_rem2(i32 %a) {
+; CHECK-LABEL: @outer_rem2(
+; CHECK-NOT: call i32 @rem
+ %C = call i32 @rem1(i32 %a, i32 1)
+ ret i32 %C
+}
+
+define i32 @rem1(i32 %a, i32 %b) {
+ %rem = urem i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %rem
+}
+
+
+define i32 @outer_rem3(i32 %a) {
+; CHECK-LABEL: @outer_rem3(
+; CHECK-NOT: call i32 @rem
+ %C = call i32 @rem2(i32 %a)
+ ret i32 %C
+}
+
+define i32 @rem2(i32 %a) {
+ %rem = urem i32 %a, %a
+ call void @pad()
+ ret i32 %rem
+}
+
+
+
+define i32 @outer_shl1(i32 %a) {
+; CHECK-LABEL: @outer_shl1(
+; CHECK-NOT: call i32 @shl
+ %C = call i32 @shl(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @shl(i32 %a, i32 %b) {
+ %shl = shl i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %shl
+}
+
+
+
+define i32 @outer_shr1(i32 %a) {
+; CHECK-LABEL: @outer_shr1(
+; CHECK-NOT: call i32 @shr
+ %C = call i32 @shr(i32 %a, i32 0)
+ ret i32 %C
+}
+
+define i32 @shr(i32 %a, i32 %b) {
+ %shr = ashr i32 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i32 %shr
+}
+
+
+
+define i1 @outer_and1(i1 %a) {
+; CHECK-LABEL: @outer_and1(
+; CHECK-NOT: call i1 @and1
+ %c = call i1 @and1(i1 %a, i1 false)
+ ret i1 %c
+}
+
+define i1 @outer_and2(i1 %a) {
+; CHECK-LABEL: @outer_and2(
+; CHECK-NOT: call i1 @and1
+ %c = call i1 @and1(i1 %a, i1 true)
+ ret i1 %c
+}
+
+define i1 @and1(i1 %a, i1 %b) {
+ %and = and i1 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i1 %and
+}
+
+
+define i1 @outer_and3(i1 %a) {
+; CHECK-LABEL: @outer_and3(
+; CHECK-NOT: call i1 @and2
+ %c = call i1 @and2(i1 %a)
+ ret i1 %c
+}
+
+define i1 @and2(i1 %a) {
+ %and = and i1 %a, %a
+ call void @pad()
+ ret i1 %and
+}
+
+
+
+define i1 @outer_or1(i1 %a) {
+; CHECK-LABEL: @outer_or1(
+; CHECK-NOT: call i1 @or1
+ %c = call i1 @or1(i1 %a, i1 false)
+ ret i1 %c
+}
+
+define i1 @outer_or2(i1 %a) {
+; CHECK-LABEL: @outer_or2(
+; CHECK-NOT: call i1 @or1
+ %c = call i1 @or1(i1 %a, i1 true)
+ ret i1 %c
+}
+
+define i1 @or1(i1 %a, i1 %b) {
+ %or = or i1 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i1 %or
+}
+
+
+define i1 @outer_or3(i1 %a) {
+; CHECK-LABEL: @outer_or3(
+; CHECK-NOT: call i1 @or2
+ %c = call i1 @or2(i1 %a)
+ ret i1 %c
+}
+
+define i1 @or2(i1 %a) {
+ %or = or i1 %a, %a
+ call void @pad()
+ ret i1 %or
+}
+
+
+
+define i1 @outer_xor1(i1 %a) {
+; CHECK-LABEL: @outer_xor1(
+; CHECK-NOT: call i1 @xor
+ %c = call i1 @xor1(i1 %a, i1 false)
+ ret i1 %c
+}
+
+define i1 @xor1(i1 %a, i1 %b) {
+ %xor = xor i1 %a, %b
+ call void @pad()
+ store i32 0, i32* @glbl
+ ret i1 %xor
+}
+
+
+define i1 @outer_xor3(i1 %a) {
+; CHECK-LABEL: @outer_xor3(
+; CHECK-NOT: call i1 @xor
+ %c = call i1 @xor2(i1 %a)
+ ret i1 %c
+}
+
+define i1 @xor2(i1 %a) {
+ %xor = xor i1 %a, %a
+ call void @pad()
+ ret i1 %xor
+}
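
To make the CHECK-NOT lines above concrete, this is roughly what @outer_add1 looks like after only the -inline run in the RUN line (a sketch, not generated output; the value name is illustrative): the callee is inlined despite -inline-threshold=0 because the cost model recognizes that `add i32 %a, 0` folds away and charges nothing for it.

define i32 @outer_add1(i32 %a) {
  %add.i = add i32 %a, 0        ; costed as free by the inliner; folded later by instcombine
  call void @pad()
  store i32 0, i32* @glbl
  ret i32 %add.i
}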
diff --git a/test/Transforms/Inline/ARM/inline-fp.ll b/test/Transforms/Inline/ARM/inline-fp.ll
new file mode 100644
index 000000000000..b4e76dfc7d2d
--- /dev/null
+++ b/test/Transforms/Inline/ARM/inline-fp.ll
@@ -0,0 +1,113 @@
+; RUN: opt -S -inline -mtriple=arm-eabi -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=NOFP
+; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2 -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=FULLFP
+; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2,+fp-only-sp -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=SINGLEFP
+; Make sure that soft float implementations are treated as more expensive
+; by the inliner.
+
+; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+; FULLFP-DAG: single inlined into test_single with cost=0 (threshold=75)
+; FULLFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; FULLFP-DAG: double inlined into test_double with cost=0 (threshold=75)
+; FULLFP-DAG: double inlined into test_double with cost=-15000 (threshold=75)
+; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+; SINGLEFP-DAG: single inlined into test_single with cost=0 (threshold=75)
+; SINGLEFP-DAG: single inlined into test_single with cost=-15000 (threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75)
+; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
+
+define i32 @test_single(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+ %call = call float @single(i32 %a, i8 zeroext %b)
+ %call2 = call float @single(i32 %c, i8 zeroext %d)
+ ret i32 0
+}
+
+define i32 @test_single_cheap(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+ %call = call float @single_cheap(i32 %a, i8 zeroext %b)
+ %call2 = call float @single_cheap(i32 %c, i8 zeroext %d)
+ ret i32 0
+}
+
+define i32 @test_double(i32 %a, i8 %b, i32 %c, i8 %d) #0 {
+ %call = call double @double(i32 %a, i8 zeroext %b)
+ %call2 = call double @double(i32 %c, i8 zeroext %d)
+ ret i32 0
+}
+
+define i32 @test_single_force_soft(i32 %a, i8 %b, i32 %c, i8 %d) #1 {
+ %call = call float @single_force_soft(i32 %a, i8 zeroext %b) #1
+ %call2 = call float @single_force_soft(i32 %c, i8 zeroext %d) #1
+ ret i32 0
+}
+
+define internal float @single(i32 %response, i8 zeroext %value1) #0 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = sitofp i32 %sub to float
+ %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+ %mul = fmul float %0, 2.620000e+03
+ %conv2 = sitofp i32 %response to float
+ %sub3 = fsub float %conv2, %mul
+ %div = fdiv float %sub3, %mul
+ ret float %div
+}
+
+define internal float @single_cheap(i32 %response, i8 zeroext %value1) #0 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = bitcast i32 %sub to float
+ %conv2 = bitcast i32 %response to float
+ %0 = tail call float @llvm.pow.f32(float %conv2, float %conv1)
+ %1 = tail call float @llvm.pow.f32(float %0, float %0)
+ %2 = tail call float @llvm.pow.f32(float %1, float %1)
+ ret float %2
+}
+
+define internal double @double(i32 %response, i8 zeroext %value1) #0 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = sitofp i32 %sub to double
+ %0 = tail call double @llvm.pow.f64(double 0x3FF028F5C0000000, double %conv1)
+ %mul = fmul double %0, 2.620000e+03
+ %conv2 = sitofp i32 %response to double
+ %sub3 = fsub double %conv2, %mul
+ %div = fdiv double %sub3, %mul
+ ret double %div
+}
+
+define internal float @single_force_soft(i32 %response, i8 zeroext %value1) #1 {
+entry:
+ %conv = zext i8 %value1 to i32
+ %sub = add nsw i32 %conv, -1
+ %conv1 = sitofp i32 %sub to float
+ %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
+ %mul = fmul float %0, 2.620000e+03
+ %conv2 = sitofp i32 %response to float
+ %sub3 = fsub float %conv2, %mul
+ %div = fdiv float %sub3, %mul
+ ret float %div
+}
+
+declare float @llvm.pow.f32(float, float) optsize minsize
+declare double @llvm.pow.f64(double, double) optsize minsize
+
+attributes #0 = { optsize }
+attributes #1 = { optsize "use-soft-float"="true" "target-features"="+soft-float" }
diff --git a/test/Transforms/Inline/inline-fp.ll b/test/Transforms/Inline/inline-fp.ll
deleted file mode 100644
index dd5972fe1b8a..000000000000
--- a/test/Transforms/Inline/inline-fp.ll
+++ /dev/null
@@ -1,137 +0,0 @@
-; RUN: opt -S -inline < %s | FileCheck %s
-; RUN: opt -S -passes='cgscc(inline)' < %s | FileCheck %s
-; Make sure that soft float implementations are calculated as being more expensive
-; to the inliner.
-
-define i32 @test_nofp() #0 {
-; f_nofp() has the "use-soft-float" attribute, so it should never get inlined.
-; CHECK-LABEL: test_nofp
-; CHECK: call float @f_nofp
-entry:
- %responseX = alloca i32, align 4
- %responseY = alloca i32, align 4
- %responseZ = alloca i32, align 4
- %valueX = alloca i8, align 1
- %valueY = alloca i8, align 1
- %valueZ = alloca i8, align 1
-
- call void @getX(i32* %responseX, i8* %valueX)
- call void @getY(i32* %responseY, i8* %valueY)
- call void @getZ(i32* %responseZ, i8* %valueZ)
-
- %0 = load i32, i32* %responseX
- %1 = load i8, i8* %valueX
- %call = call float @f_nofp(i32 %0, i8 zeroext %1)
- %2 = load i32, i32* %responseZ
- %3 = load i8, i8* %valueZ
- %call2 = call float @f_nofp(i32 %2, i8 zeroext %3)
- %call3 = call float @fabsf(float %call)
- %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
- br i1 %cmp, label %if.end12, label %if.else
-
-if.else: ; preds = %entry
- %4 = load i32, i32* %responseY
- %5 = load i8, i8* %valueY
- %call1 = call float @f_nofp(i32 %4, i8 zeroext %5)
- %call4 = call float @fabsf(float %call1)
- %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
- br i1 %cmp5, label %if.end12, label %if.else7
-
-if.else7: ; preds = %if.else
- %call8 = call float @fabsf(float %call2)
- %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
- br i1 %cmp9, label %if.then10, label %if.end12
-
-if.then10: ; preds = %if.else7
- br label %if.end12
-
-if.end12: ; preds = %if.else, %entry, %if.then10, %if.else7
- %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
- ret i32 %success.0
-}
-
-define i32 @test_hasfp() #0 {
-; f_hasfp() does not have the "use-soft-float" attribute, so it should get inlined.
-; CHECK-LABEL: test_hasfp
-; CHECK-NOT: call float @f_hasfp
-entry:
- %responseX = alloca i32, align 4
- %responseY = alloca i32, align 4
- %responseZ = alloca i32, align 4
- %valueX = alloca i8, align 1
- %valueY = alloca i8, align 1
- %valueZ = alloca i8, align 1
-
- call void @getX(i32* %responseX, i8* %valueX)
- call void @getY(i32* %responseY, i8* %valueY)
- call void @getZ(i32* %responseZ, i8* %valueZ)
-
- %0 = load i32, i32* %responseX
- %1 = load i8, i8* %valueX
- %call = call float @f_hasfp(i32 %0, i8 zeroext %1)
- %2 = load i32, i32* %responseZ
- %3 = load i8, i8* %valueZ
- %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3)
- %call3 = call float @fabsf(float %call)
- %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000
- br i1 %cmp, label %if.end12, label %if.else
-
-if.else: ; preds = %entry
- %4 = load i32, i32* %responseY
- %5 = load i8, i8* %valueY
- %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5)
- %call4 = call float @fabsf(float %call1)
- %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000
- br i1 %cmp5, label %if.end12, label %if.else7
-
-if.else7: ; preds = %if.else
- %call8 = call float @fabsf(float %call2)
- %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000
- br i1 %cmp9, label %if.then10, label %if.end12
-
-if.then10: ; preds = %if.else7
- br label %if.end12
-
-if.end12: ; preds = %if.else, %entry, %if.then10, %if.else7
- %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ]
- ret i32 %success.0
-}
-
-declare void @getX(i32*, i8*) #0
-
-declare void @getY(i32*, i8*) #0
-
-declare void @getZ(i32*, i8*) #0
-
-define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 {
-entry:
- %conv = zext i8 %value1 to i32
- %sub = add nsw i32 %conv, -1
- %conv1 = sitofp i32 %sub to float
- %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
- %mul = fmul float %0, 2.620000e+03
- %conv2 = sitofp i32 %response to float
- %sub3 = fsub float %conv2, %mul
- %div = fdiv float %sub3, %mul
- ret float %div
-}
-
-define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 {
-entry:
- %conv = zext i8 %value1 to i32
- %sub = add nsw i32 %conv, -1
- %conv1 = sitofp i32 %sub to float
- %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1)
- %mul = fmul float %0, 2.620000e+03
- %conv2 = sitofp i32 %response to float
- %sub3 = fsub float %conv2, %mul
- %div = fdiv float %sub3, %mul
- ret float %div
-}
-
-declare float @fabsf(float) optsize minsize
-
-declare float @llvm.pow.f32(float, float) optsize minsize
-
-attributes #0 = { optsize }
-attributes #1 = { optsize "use-soft-float"="true" }
diff --git a/test/Transforms/Inline/redundant-loads.ll b/test/Transforms/Inline/redundant-loads.ll
index 6b89f1db484b..176f605fc73b 100644
--- a/test/Transforms/Inline/redundant-loads.ll
+++ b/test/Transforms/Inline/redundant-loads.ll
@@ -184,3 +184,21 @@ define void @inner9(i32* %a, void ()* %f) {
call void @pad()
ret void
}
+
+
+define void @outer10(i32* %a) {
+; CHECK-LABEL: @outer10(
+; CHECK: call void @inner10
+ %b = alloca i32
+ call void @inner10(i32* %a, i32* %b)
+ ret void
+}
+
+define void @inner10(i32* %a, i32* %b) {
+ %1 = load i32, i32* %a
+ store i32 %1, i32 * %b
+ %2 = load volatile i32, i32* %a ; volatile load should be kept.
+ call void @pad()
+ %3 = load volatile i32, i32* %a ; Same as the above.
+ ret void
+}
diff --git a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
index 1833558cbceb..7a315094a04e 100644
--- a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
+++ b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
@@ -5,18 +5,18 @@ declare i8* @llvm.adjust.trampoline(i8*)
declare i32 @f(i8 * nest, i32)
; Most common case
-define i32 @test0(i32 %n) {
+define i32 @test0(i32 %n) !dbg !4 {
%alloca = alloca [10 x i8], align 16
%gep = getelementptr [10 x i8], [10 x i8]* %alloca, i32 0, i32 0
call void @llvm.init.trampoline(i8* %gep, i8* bitcast (i32 (i8*, i32)* @f to i8*),
i8* null)
%tramp = call i8* @llvm.adjust.trampoline(i8* %gep)
%function = bitcast i8* %tramp to i32(i32)*
- %ret = call i32 %function(i32 %n)
+ %ret = call i32 %function(i32 %n), !dbg !10
ret i32 %ret
-; CHECK: define i32 @test0(i32 %n) {
-; CHECK: %ret = call i32 @f(i8* nest null, i32 %n)
+; CHECK: define i32 @test0(i32 %n) !dbg !4 {
+; CHECK: %ret = call i32 @f(i8* nest null, i32 %n), !dbg !10
}
define i32 @test1(i32 %n, i8* %trampmem) {
@@ -85,3 +85,18 @@ define i32 @test4(i32 %n) {
; CHECK: %ret1 = call i32 @f(i8* nest null, i32 %n)
; CHECK: %ret2 = call i32 @f(i8* nest null, i32 %n)
}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.0 (trunk 127710)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
+!1 = !DIFile(filename: "string.h", directory: "Game")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "passthru", scope: !1, file: !1, line: 79, type: !5, isLocal: true, isDefinition: true, scopeLine: 79, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: null, size: 64, align: 64)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 78, type: !7)
+!10 = !DILocation(line: 78, column: 28, scope: !4)
diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll
index 53175a7b7253..c760283f9e52 100644
--- a/test/Transforms/JumpThreading/guards.ll
+++ b/test/Transforms/JumpThreading/guards.ll
@@ -278,3 +278,106 @@ L2:
L3:
ret void
}
+
+; Make sure that we don't PRE a non-speculable load across a guard.
+define void @unsafe_pre_across_guard(i8* %p, i1 %load.is.valid) {
+
+; CHECK-LABEL: @unsafe_pre_across_guard(
+; CHECK-NOT: loaded.pr
+; CHECK: entry:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+; CHECK-NEXT: %loaded = load i8, i8* %p
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+; Make sure that we can safely PRE a speculable load across a guard.
+define void @safe_pre_across_guard(i8* noalias nocapture readonly dereferenceable(8) %p, i1 %load.is.valid) {
+
+; CHECK-LABEL: @safe_pre_across_guard(
+; CHECK: entry:
+; CHECK-NEXT: %loaded.pr = load i8, i8* %p
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ]
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+; Make sure that we don't PRE a non-speculable load across a call which may
+; alias with the load.
+define void @unsafe_pre_across_call(i8* %p) {
+
+; CHECK-LABEL: @unsafe_pre_across_call(
+; CHECK-NOT: loaded.pr
+; CHECK: entry:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: call i32 @f1()
+; CHECK-NEXT: %loaded = load i8, i8* %p
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call i32 @f1()
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+; Make sure that we can safely PRE a speculable load across a call.
+define void @safe_pre_across_call(i8* noalias nocapture readonly dereferenceable(8) %p) {
+
+; CHECK-LABEL: @safe_pre_across_call(
+; CHECK: entry:
+; CHECK-NEXT: %loaded.pr = load i8, i8* %p
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ]
+; CHECK-NEXT: call i32 @f1()
+; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0
+; CHECK-NEXT: br i1 %continue, label %exit, label %loop
+
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ call i32 @f1()
+ %loaded = load i8, i8* %p
+ %continue = icmp eq i8 %loaded, 0
+ br i1 %continue, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
diff --git a/test/Transforms/LoopVectorize/legal_preheader_check.ll b/test/Transforms/LoopVectorize/legal_preheader_check.ll
new file mode 100644
index 000000000000..32aa796394d6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/legal_preheader_check.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -debug -S -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; D40973
+; Make sure LV legal bails out when the loop doesn't have a legal pre-header.
+
+; CHECK: LV: Loop doesn't have a legal pre-header.
+
+define void @inc(i32 %n, i8* %P) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %BB1, label %BB2
+
+BB1:
+ indirectbr i8* %P, [label %.lr.ph]
+
+BB2:
+ br label %.lr.ph
+
+.lr.ph:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %BB1 ], [ 0, %BB2 ]
+ %indvars.iv.next = add i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:
+ ret void
+}
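
For contrast, a minimal sketch (not part of the test; names are illustrative) of the same counting loop with a legal pre-header: the loop header is reached from exactly one out-of-loop block, and that block is not an indirectbr target, so it does not trip the pre-header check above.

define void @inc_with_preheader(i32 %n, i32* %P) {
entry:
  %guard = icmp sgt i32 %n, 0
  br i1 %guard, label %ph, label %exit

ph:                                  ; dedicated pre-header: single edge into the loop
  br label %loop

loop:
  %iv = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
  %addr = getelementptr inbounds i32, i32* %P, i32 %iv
  store i32 %iv, i32* %addr
  %iv.next = add i32 %iv, 1
  %exitcond = icmp eq i32 %iv.next, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}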
diff --git a/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
new file mode 100644
index 000000000000..e3d1f6dd2b17
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Test memcpy-memcpy dependencies across invoke edges.
+
+; Test that memcpyopt works across the non-unwind edge of an invoke.
+
+define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %temp = alloca i8, i32 64
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+ invoke void @invoke_me()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ landingpad { i8*, i32 }
+ catch i8* null
+ ret void
+
+try.cont:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+ ret void
+}
+
+; Test that memcpyopt works across the unwind edge of an invoke.
+
+define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %temp = alloca i8, i32 64
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+ invoke void @invoke_me()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ landingpad { i8*, i32 }
+ catch i8* null
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+ ret void
+
+try.cont:
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare i32 @__gxx_personality_v0(...)
+declare void @invoke_me() readnone
diff --git a/test/Transforms/MemCpyOpt/merge-into-memset.ll b/test/Transforms/MemCpyOpt/merge-into-memset.ll
new file mode 100644
index 000000000000..fc31038a4e6d
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/merge-into-memset.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Update cached non-local dependence information when merging stores into memset.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Don't delete the memcpy in %if.then, even though it depends on an instruction
+; which will be deleted.
+
+; CHECK-LABEL: @foo
+define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) {
+entry:
+ %tmp = alloca [50 x i8], align 8
+ %tmp4 = bitcast [50 x i8]* %tmp to i8*
+ %tmp1 = getelementptr inbounds i8, i8* %tmp4, i64 1
+ call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
+ store i8 0, i8* %tmp4, align 8, !dbg !5
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %tmp1, i8* nonnull %d, i64 10, i32 1, i1 false)
+ br i1 %c, label %if.then, label %exit
+
+if.then:
+; CHECK: if.then:
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false)
+ br label %exit
+
+exit:
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.rs", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !DILocation(line: 8, column: 5, scope: !6)
+!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !7, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
diff --git a/test/Transforms/MemCpyOpt/mixed-sizes.ll b/test/Transforms/MemCpyOpt/mixed-sizes.ll
new file mode 100644
index 000000000000..9091fe7f56c0
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Handle memcpy-memcpy dependencies of differing sizes correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Don't delete the second memcpy, even though there's an earlier
+; memcpy with a larger size from the same address.
+
+; CHECK-LABEL: @foo
+define i32 @foo(i1 %z) {
+entry:
+ %a = alloca [10 x i32]
+ %s = alloca [10 x i32]
+ %0 = bitcast [10 x i32]* %a to i8*
+ %1 = bitcast [10 x i32]* %s to i8*
+ call void @llvm.memset.p0i8.i64(i8* nonnull %1, i8 0, i64 40, i32 16, i1 false)
+ %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 0
+ store i32 1, i32* %arrayidx
+ %scevgep = getelementptr [10 x i32], [10 x i32]* %s, i64 0, i64 1
+ %scevgep7 = bitcast i32* %scevgep to i8*
+ br i1 %z, label %for.body3.lr.ph, label %for.inc7.1
+
+for.body3.lr.ph: ; preds = %entry
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 17179869180, i32 4, i1 false)
+ br label %for.inc7.1
+
+for.inc7.1:
+; CHECK: for.inc7.1:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false)
+ %2 = load i32, i32* %arrayidx
+ ret i32 %2
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1)
diff --git a/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
new file mode 100644
index 000000000000..5b0510211d9f
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+; Make sure memcpy-memcpy dependence is optimized across
+; basic blocks (conditional branches and invokes).
+
+%struct.s = type { i32, i32 }
+
+@s_foo = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
+@s_baz = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4
+@i = external constant i8*
+
+declare void @qux()
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare i8* @__cxa_begin_catch(i8*)
+
+; A simple partial redundancy. Test that the second memcpy is optimized
+; to copy directly from the original source rather than from the temporary.
+
+; CHECK-LABEL: @wobble
+define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) {
+bb:
+ %temp = alloca i8, i32 64
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false)
+ br i1 %some_condition, label %more, label %out
+
+out:
+ call void @qux()
+ unreachable
+
+more:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false)
+ ret void
+}
+
+; A CFG triangle with a partial redundancy targeting an alloca. Test that the
+; memcpy inside the triangle is optimized to copy directly from the original
+; source rather than from the temporary.
+
+; CHECK-LABEL: @foo
+define i32 @foo(i1 %t3) {
+bb:
+ %s = alloca %struct.s, align 4
+ %t = alloca %struct.s, align 4
+ %s1 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+ br i1 %t3, label %bb4, label %bb7
+
+bb4: ; preds = %bb
+ %t5 = bitcast %struct.s* %t to i8*
+ %s6 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* %s6, i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false)
+ br label %bb7
+
+bb7: ; preds = %bb4, %bb
+ %t8 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 0
+ %t9 = load i32, i32* %t8, align 4
+ %t10 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 1
+ %t11 = load i32, i32* %t10, align 4
+ %t12 = add i32 %t9, %t11
+ ret i32 %t12
+}
+
+; A CFG diamond with an invoke on one side, and a partially redundant memcpy
+; into an alloca on the other. Test that the memcpy inside the diamond is
+; optimized to copy directly from the original source rather than from the
+; temporary. This more complex test represents a relatively common usage
+; pattern.
+
+; CHECK-LABEL: @baz
+define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+bb:
+ %s = alloca %struct.s, align 4
+ %t = alloca %struct.s, align 4
+ %s3 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+ br i1 %t5, label %bb6, label %bb22
+
+bb6: ; preds = %bb
+ invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null)
+ to label %bb25 unwind label %bb9
+
+bb9: ; preds = %bb6
+ %t10 = landingpad { i8*, i32 }
+ catch i8* null
+ br label %bb13
+
+bb13: ; preds = %bb9
+ %t15 = call i8* @__cxa_begin_catch(i8* null)
+ br label %bb23
+
+bb22: ; preds = %bb
+ %t23 = bitcast %struct.s* %t to i8*
+ %s24 = bitcast %struct.s* %s to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* %s24, i64 8, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false)
+ br label %bb23
+
+bb23: ; preds = %bb22, %bb13
+ %t17 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 0
+ %t18 = load i32, i32* %t17, align 4
+ %t19 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 1
+ %t20 = load i32, i32* %t19, align 4
+ %t21 = add nsw i32 %t18, %t20
+ ret i32 %t21
+
+bb25: ; preds = %bb6
+ unreachable
+}
diff --git a/test/Transforms/NewGVN/tbaa.ll b/test/Transforms/NewGVN/tbaa.ll
index 3dcc4f8acc14..d48ededac03a 100644
--- a/test/Transforms/NewGVN/tbaa.ll
+++ b/test/Transforms/NewGVN/tbaa.ll
@@ -1,7 +1,7 @@
; RUN: opt -tbaa -basicaa -newgvn -S < %s | FileCheck %s
define i32 @test1(i8* %p, i8* %q) {
-; CHECK: @test1(i8* %p, i8* %q)
+; CHECK-LABEL: @test1(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) {
}
define i32 @test2(i8* %p, i8* %q) {
-; CHECK: @test2(i8* %p, i8* %q)
+; CHECK-LABEL: @test2(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
@@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) {
}
define i32 @test3(i8* %p, i8* %q) {
-; CHECK: @test3(i8* %p, i8* %q)
+; CHECK-LABEL: @test3(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !3
@@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) {
}
define i32 @test4(i8* %p, i8* %q) {
-; CHECK: @test4(i8* %p, i8* %q)
+; CHECK-LABEL: @test4(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !1
@@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) {
}
define i32 @test5(i8* %p, i8* %q) {
-; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test5(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !1
@@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) {
}
define i32 @test6(i8* %p, i8* %q) {
-; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
+; CHECK-LABEL: @test6(i8* %p, i8* %q)
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]]
; CHECK: %c = add i32 %a, %a
%a = call i32 @foo(i8* %p), !tbaa !0
%b = call i32 @foo(i8* %p), !tbaa !3
@@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) {
}
define i32 @test7(i8* %p, i8* %q) {
-; CHECK: @test7(i8* %p, i8* %q)
+; CHECK-LABEL: @test7(i8* %p, i8* %q)
; CHECK: call i32 @foo(i8* %p)
; CHECK-NOT: tbaa
; CHECK: %c = add i32 %a, %a
@@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) {
ret i32 %c
}
-
-
define i32 @test8(i32* %p, i32* %q) {
-; CHECK-LABEL: test8
+; CHECK-LABEL: @test8
; CHECK-NEXT: store i32 15, i32* %p
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) {
%c = sub i32 %a, %b
ret i32 %c
}
+
define i32 @test9(i32* %p, i32* %q) {
-; CHECK-LABEL: test9
+; CHECK-LABEL: @test9
; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: ret i32 0
; Since we know the location is invariant, we can forward the
@@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) {
ret i32 %c
}
+define i32 @test10(i8* %p, i8* %q) {
+; If one access encloses the other, then the merged access is the enclosed one
+; and not just the common final access type.
+; CHECK-LABEL: @test10
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]]
+; CHECK: %c = add i32 %a, %a
+ %a = call i32 @foo(i8* %p), !tbaa !15 ; TAG_X_i
+ %b = call i32 @foo(i8* %p), !tbaa !19 ; TAG_Y_x_i
+ %c = add i32 %a, %b
+ ret i32 %c
+}
declare void @clobber()
declare i32 @foo(i8*) readonly
-; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
+; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
+; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
+; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
+; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
+; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
!0 = !{!5, !5, i64 0}
!1 = !{!6, !6, i64 0}
!2 = !{!"tbaa root"}
@@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly
!8 = !{!"another root"}
!11 = !{!"scalar type", !8}
+; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
+; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
+; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
+!15 = !{!16, !17, i64 0} ; TAG_X_i
+!16 = !{!"struct X", !17, i64 0} ; struct X { int i; };
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"char", !2, i64 0}
-;; A TBAA structure who's only point is to have a constant location
+!19 = !{!20, !17, i64 0} ; TAG_Y_x_i
+!20 = !{!"struct Y", !16, i64 0} ; struct Y { struct X x; };
+
+; A TBAA structure whose only point is to have a constant location.
!9 = !{!"yet another root"}
!10 = !{!"node", !9, i64 1}
-
diff --git a/test/Transforms/PGOProfile/icp_covariant_call_return.ll b/test/Transforms/PGOProfile/icp_covariant_call_return.ll
index fc5054e3a574..aba075461deb 100644
--- a/test/Transforms/PGOProfile/icp_covariant_call_return.ll
+++ b/test/Transforms/PGOProfile/icp_covariant_call_return.ll
@@ -22,8 +22,7 @@ entry:
%vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8
%vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0
%tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*)
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM:if.true.direct_targ:
; ICALL-PROM: [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D*
diff --git a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
index d2ff47dda0e6..0a4444783eb0 100644
--- a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
+++ b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll
@@ -32,18 +32,19 @@ invoke.cont:
%vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8
%vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0
%tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*)
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM:if.true.direct_targ:
; ICALL-PROM: [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D*
; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = invoke %struct.Derived* @_ZN1D4funcEv(%struct.D* [[ARG_BITCAST]])
-; ICALL-PROM: to label %if.end.icp unwind label %lpad
+; ICALL-PROM: to label %if.true.direct_targ.if.end.icp_crit_edge unwind label %lpad
+; ICALL-PROM:if.true.direct_targ.if.end.icp_crit_edge:
+; ICALL-PROM: [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base*
+; ICALL-PROM: br label %if.end.icp
; ICALL-PROM:if.false.orig_indirect:
; ICAll-PROM: %call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1)
; ICAll-PROM: to label %invoke.cont1 unwind label %lpad
; ICALL-PROM:if.end.icp:
-; ICALL-PROM: [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base*
; ICALL-PROM: br label %invoke.cont1
%call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1)
to label %invoke.cont1 unwind label %lpad, !prof !1
diff --git a/test/Transforms/PGOProfile/icp_invoke.ll b/test/Transforms/PGOProfile/icp_invoke.ll
index 2ec564627aa1..1cacc1bc1aca 100644
--- a/test/Transforms/PGOProfile/icp_invoke.ll
+++ b/test/Transforms/PGOProfile/icp_invoke.ll
@@ -20,8 +20,7 @@ entry:
define i32 @_Z3goov() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
entry:
%tmp = load void ()*, void ()** @foo1, align 8
-; ICP: [[BITCAST_IC1:%[0-9]+]] = bitcast void ()* %tmp to i8*
-; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (void ()* @_ZL4bar1v to i8*)
+; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq void ()* %tmp, @_ZL4bar1v
; ICP: br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICP:[[TRUE_LABEL_IC1]]:
; ICP: invoke void @_ZL4bar1v()
@@ -49,17 +48,19 @@ catch:
try.cont:
%tmp6 = load i32 ()*, i32 ()** @foo2, align 8
-; ICP: [[BITCAST_IC2:%[0-9]+]] = bitcast i32 ()* %tmp6 to i8*
-; ICP: [[CMP_IC2:%[0-9]+]] = icmp eq i8* [[BITCAST_IC2]], bitcast (i32 ()* @_ZL4bar2v to i8*)
+; ICP: [[CMP_IC2:%[0-9]+]] = icmp eq i32 ()* %tmp6, @_ZL4bar2v
; ICP: br i1 [[CMP_IC2]], label %[[TRUE_LABEL_IC2:.*]], label %[[FALSE_LABEL_IC2:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICP:[[TRUE_LABEL_IC2]]:
-; ICP: [[RESULT_IC2:%[0-9]+]] = invoke i32 @_ZL4bar2v()
-; ICP: to label %[[DCALL_NORMAL_DEST_IC2:.*]] unwind label %lpad1
+; ICP: [[RESULT_IC2_0:%[0-9]+]] = invoke i32 @_ZL4bar2v()
+; ICP: to label %[[MERGE_BB:.*]] unwind label %lpad1
; ICP:[[FALSE_LABEL_IC2]]:
+; ICP: [[RESULT_IC2_1:%.+]] = invoke i32 %tmp6()
+; ICP: to label %[[MERGE_BB]] unwind label %lpad1
%call = invoke i32 %tmp6()
to label %try.cont8 unwind label %lpad1, !prof !3
-; ICP:[[DCALL_NORMAL_DEST_IC2]]:
+; ICP:[[MERGE_BB]]:
+; ICP: [[MERGE_PHI:%.+]] = phi i32 [ [[RESULT_IC2_1]], %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2_0]], %[[TRUE_LABEL_IC2]] ]
; ICP: br label %try.cont8
lpad1:
%tmp7 = landingpad { i8*, i32 }
@@ -77,7 +78,7 @@ catch6:
try.cont8:
%i.0 = phi i32 [ undef, %catch6 ], [ %call, %try.cont ]
-; ICP: %i.0 = phi i32 [ undef, %catch6 ], [ %call, %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2]], %[[DCALL_NORMAL_DEST_IC2]] ]
+; ICP: %i.0 = phi i32 [ undef, %catch6 ], [ [[MERGE_PHI]], %[[MERGE_BB]] ]
ret i32 %i.0
eh.resume:
diff --git a/test/Transforms/PGOProfile/icp_invoke_nouse.ll b/test/Transforms/PGOProfile/icp_invoke_nouse.ll
index 5a1e6358cb61..096d2e0f222e 100644
--- a/test/Transforms/PGOProfile/icp_invoke_nouse.ll
+++ b/test/Transforms/PGOProfile/icp_invoke_nouse.ll
@@ -18,8 +18,7 @@ entry:
if.end: ; preds = %entry
%fptr = load i32 ()*, i32 ()** @pfptr, align 8
-; ICP: [[BITCAST_IC1:%[0-9]+]] = bitcast i32 ()* %fptr to i8*
-; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (i32 ()* @_ZL4bar1v to i8*)
+; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i32 ()* %fptr, @_ZL4bar1v
; ICP: br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICP:[[TRUE_LABEL_IC1]]:
; ICP: invoke i32 @_ZL4bar1v()
diff --git a/test/Transforms/PGOProfile/icp_vararg.ll b/test/Transforms/PGOProfile/icp_vararg.ll
index 400aab3aead7..ec243470290a 100644
--- a/test/Transforms/PGOProfile/icp_vararg.ll
+++ b/test/Transforms/PGOProfile/icp_vararg.ll
@@ -13,8 +13,7 @@ entry:
define i32 @bar() #1 {
entry:
%tmp = load i32 (i32, ...)*, i32 (i32, ...)** @foo, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast i32 (i32, ...)* %tmp to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 (i32, ...)* @va_func to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i32 (i32, ...)* %tmp, @va_func
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM:if.true.direct_targ:
; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 (i32, ...) @va_func(i32 3, i32 12, i32 22, i32 4)
diff --git a/test/Transforms/PGOProfile/indirect_call_promotion.ll b/test/Transforms/PGOProfile/indirect_call_promotion.ll
index 6832fecfaed3..85df5260f199 100644
--- a/test/Transforms/PGOProfile/indirect_call_promotion.ll
+++ b/test/Transforms/PGOProfile/indirect_call_promotion.ll
@@ -43,8 +43,7 @@ entry:
define i32 @bar() {
entry:
%tmp = load i32 ()*, i32 ()** @foo, align 8
-; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast i32 ()* %tmp to i8*
-; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 ()* @func4 to i8*)
+; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i32 ()* %tmp, @func4
; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
; ICALL-PROM: if.true.direct_targ:
; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 @func4()
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
index 4def8ce561c0..557a83a75626 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -11,16 +11,20 @@
define i32 @fn1() {
; CHECK-LABEL: @fn1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 8, i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP7]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
; CHECK-NEXT: ret i32 0
;
entry:
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
deleted file mode 100644
index 5fc0298b6cef..000000000000
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
-
-
-;void jumble (int * restrict A, int * restrict B) {
- ; int tmp0 = A[10]*A[0];
- ; int tmp1 = A[11]*A[1];
- ; int tmp2 = A[12]*A[3];
- ; int tmp3 = A[13]*A[2];
- ; B[0] = tmp0;
- ; B[1] = tmp1;
- ; B[2] = tmp2;
- ; B[3] = tmp3;
- ;}
-
-
- ; Function Attrs: norecurse nounwind uwtable
- define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
-; CHECK-LABEL: @jumble1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
- %0 = load i32, i32* %arrayidx, align 4
- %1 = load i32, i32* %A, align 4
- %mul = mul nsw i32 %0, %1
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
- %2 = load i32, i32* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
- %3 = load i32, i32* %arrayidx3, align 4
- %mul4 = mul nsw i32 %2, %3
- %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
- %4 = load i32, i32* %arrayidx5, align 4
- %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
- %5 = load i32, i32* %arrayidx6, align 4
- %mul7 = mul nsw i32 %4, %5
- %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
- %6 = load i32, i32* %arrayidx8, align 4
- %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
- %7 = load i32, i32* %arrayidx9, align 4
- %mul10 = mul nsw i32 %6, %7
- store i32 %mul, i32* %B, align 4
- %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
- store i32 %mul4, i32* %arrayidx12, align 4
- %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
- store i32 %mul7, i32* %arrayidx13, align 4
- %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
- store i32 %mul10, i32* %arrayidx14, align 4
- ret void
- }
-
-;Reversing the operand of MUL
- ; Function Attrs: norecurse nounwind uwtable
- define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
-; CHECK-LABEL: @jumble2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
- %0 = load i32, i32* %arrayidx, align 4
- %1 = load i32, i32* %A, align 4
- %mul = mul nsw i32 %1, %0
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
- %2 = load i32, i32* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
- %3 = load i32, i32* %arrayidx3, align 4
- %mul4 = mul nsw i32 %3, %2
- %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
- %4 = load i32, i32* %arrayidx5, align 4
- %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
- %5 = load i32, i32* %arrayidx6, align 4
- %mul7 = mul nsw i32 %5, %4
- %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
- %6 = load i32, i32* %arrayidx8, align 4
- %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
- %7 = load i32, i32* %arrayidx9, align 4
- %mul10 = mul nsw i32 %7, %6
- store i32 %mul, i32* %B, align 4
- %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
- store i32 %mul4, i32* %arrayidx12, align 4
- %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
- store i32 %mul7, i32* %arrayidx13, align 4
- %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
- store i32 %mul10, i32* %arrayidx14, align 4
- ret void
- }
-
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
deleted file mode 100644
index 568fd9f3ac79..000000000000
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
+++ /dev/null
@@ -1,225 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
-
-;void phiUsingLoads(int *restrict A, int *restrict B) {
-; int tmp0, tmp1, tmp2, tmp3;
-; for (int i = 0; i < 100; i++) {
-; if (A[0] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 2];
-; tmp3 = A[i + 3];
-; } else if (A[25] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 2];
-; tmp3 = A[i + 3];
-; } else if (A[50] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 2];
-; tmp3 = A[i + 3];
-; } else if (A[75] == 0) {
-; tmp0 = A[i + 0];
-; tmp1 = A[i + 1];
-; tmp2 = A[i + 3];
-; tmp3 = A[i + 2];
-; }
-; }
-; B[0] = tmp0;
-; B[1] = tmp1;
-; B[2] = tmp2;
-; B[3] = tmp3;
-;}
-
-
-; Function Attrs: norecurse nounwind uwtable
-define void @phiUsingLoads(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) local_unnamed_addr #0 {
-; CHECK-LABEL: @phiUsingLoads(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 25
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 50
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 75
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
-; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else:
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[CMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT: br i1 [[CMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE27:%.*]]
-; CHECK: if.then14:
-; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX17]] to <4 x i32>*
-; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else27:
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[CMP29:%.*]] = icmp eq i32 [[TMP14]], 0
-; CHECK-NEXT: br i1 [[CMP29]], label [[IF_THEN30:%.*]], label [[IF_ELSE43:%.*]]
-; CHECK: if.then30:
-; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX33]] to <4 x i32>*
-; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else43:
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[CMP45:%.*]] = icmp eq i32 [[TMP20]], 0
-; CHECK-NEXT: br i1 [[CMP45]], label [[IF_THEN46:%.*]], label [[FOR_INC]]
-; CHECK: if.then46:
-; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
-; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-;
-entry:
- %0 = load i32, i32* %A, align 4
- %cmp1 = icmp eq i32 %0, 0
- %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 25
- %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 50
- %arrayidx44 = getelementptr inbounds i32, i32* %A, i64 75
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc
- store i32 %tmp0.1, i32* %B, align 4
- %arrayidx64 = getelementptr inbounds i32, i32* %B, i64 1
- store i32 %tmp1.1, i32* %arrayidx64, align 4
- %arrayidx65 = getelementptr inbounds i32, i32* %B, i64 2
- store i32 %tmp2.1, i32* %arrayidx65, align 4
- %arrayidx66 = getelementptr inbounds i32, i32* %B, i64 3
- store i32 %tmp3.1, i32* %arrayidx66, align 4
- ret void
-
-for.body: ; preds = %for.inc, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
- %tmp3.0111 = phi i32 [ undef, %entry ], [ %tmp3.1, %for.inc ]
- %tmp2.0110 = phi i32 [ undef, %entry ], [ %tmp2.1, %for.inc ]
- %tmp1.0109 = phi i32 [ undef, %entry ], [ %tmp1.1, %for.inc ]
- %tmp0.0108 = phi i32 [ undef, %entry ], [ %tmp0.1, %for.inc ]
- br i1 %cmp1, label %if.then, label %if.else
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %1 = load i32, i32* %arrayidx2, align 4
- %2 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %2
- %3 = load i32, i32* %arrayidx5, align 4
- %4 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %4
- %5 = load i32, i32* %arrayidx8, align 4
- %6 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %6
- %7 = load i32, i32* %arrayidx11, align 4
- br label %for.inc
-
-if.else: ; preds = %for.body
- %8 = load i32, i32* %arrayidx12, align 4
- %cmp13 = icmp eq i32 %8, 0
- br i1 %cmp13, label %if.then14, label %if.else27
-
-if.then14: ; preds = %if.else
- %arrayidx17 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %9 = load i32, i32* %arrayidx17, align 4
- %10 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %10
- %11 = load i32, i32* %arrayidx20, align 4
- %12 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx23 = getelementptr inbounds i32, i32* %A, i64 %12
- %13 = load i32, i32* %arrayidx23, align 4
- %14 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx26 = getelementptr inbounds i32, i32* %A, i64 %14
- %15 = load i32, i32* %arrayidx26, align 4
- br label %for.inc
-
-if.else27: ; preds = %if.else
- %16 = load i32, i32* %arrayidx28, align 4
- %cmp29 = icmp eq i32 %16, 0
- br i1 %cmp29, label %if.then30, label %if.else43
-
-if.then30: ; preds = %if.else27
- %arrayidx33 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %17 = load i32, i32* %arrayidx33, align 4
- %18 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx36 = getelementptr inbounds i32, i32* %A, i64 %18
- %19 = load i32, i32* %arrayidx36, align 4
- %20 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx39 = getelementptr inbounds i32, i32* %A, i64 %20
- %21 = load i32, i32* %arrayidx39, align 4
- %22 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx42 = getelementptr inbounds i32, i32* %A, i64 %22
- %23 = load i32, i32* %arrayidx42, align 4
- br label %for.inc
-
-if.else43: ; preds = %if.else27
- %24 = load i32, i32* %arrayidx44, align 4
- %cmp45 = icmp eq i32 %24, 0
- br i1 %cmp45, label %if.then46, label %for.inc
-
-if.then46: ; preds = %if.else43
- %arrayidx49 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
- %25 = load i32, i32* %arrayidx49, align 4
- %26 = add nuw nsw i64 %indvars.iv, 1
- %arrayidx52 = getelementptr inbounds i32, i32* %A, i64 %26
- %27 = load i32, i32* %arrayidx52, align 4
- %28 = add nuw nsw i64 %indvars.iv, 3
- %arrayidx55 = getelementptr inbounds i32, i32* %A, i64 %28
- %29 = load i32, i32* %arrayidx55, align 4
- %30 = add nuw nsw i64 %indvars.iv, 2
- %arrayidx58 = getelementptr inbounds i32, i32* %A, i64 %30
- %31 = load i32, i32* %arrayidx58, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.then, %if.then30, %if.else43, %if.then46, %if.then14
- %tmp0.1 = phi i32 [ %1, %if.then ], [ %9, %if.then14 ], [ %17, %if.then30 ], [ %25, %if.then46 ], [ %tmp0.0108, %if.else43 ]
- %tmp1.1 = phi i32 [ %3, %if.then ], [ %11, %if.then14 ], [ %19, %if.then30 ], [ %27, %if.then46 ], [ %tmp1.0109, %if.else43 ]
- %tmp2.1 = phi i32 [ %5, %if.then ], [ %13, %if.then14 ], [ %21, %if.then30 ], [ %29, %if.then46 ], [ %tmp2.0110, %if.else43 ]
- %tmp3.1 = phi i32 [ %7, %if.then ], [ %15, %if.then14 ], [ %23, %if.then30 ], [ %31, %if.then46 ], [ %tmp3.0111, %if.else43 ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 100
- br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
index be58521ed898..06e051a90b0d 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -5,27 +5,34 @@
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
-; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
+; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
+; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
-; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
+; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_7]], align 4
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
+; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_8]], align 4
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
+; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_9]], align 4
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
+; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_10]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
index 6ae763520013..1b2c76384e0b 100644
--- a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
+++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -6,26 +6,33 @@
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_9]], align 4
+; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_7]], align 4
+; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_10]], align 4
+; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_8]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SampleProfile/entry_counts.ll b/test/Transforms/SampleProfile/entry_counts.ll
index 6137a6908cf5..cab7c87e0493 100644
--- a/test/Transforms/SampleProfile/entry_counts.ll
+++ b/test/Transforms/SampleProfile/entry_counts.ll
@@ -9,8 +9,8 @@ entry:
ret void, !dbg !9
}
-; This function does not have profile, check if function_entry_count is 0
-; CHECK: {{.*}} = !{!"function_entry_count", i64 0}
+; This function does not have a profile; check that function_entry_count is -1
+; CHECK: {{.*}} = !{!"function_entry_count", i64 -1}
define void @no_profile() {
entry:
ret void
diff --git a/test/Transforms/SimplifyCFG/X86/if-conversion.ll b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
new file mode 100644
index 000000000000..28702572d480
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,231 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if-conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The first several cases test that FindLongDependenceChain returns true, so
+; if-conversion is blocked.
+
+define i64 @test1(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-NOT: select
+}
+
+define i64 @test2(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = add i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: select
+}
+
+; The following cases test that FindLongDependenceChain returns false, so
+; if-conversion will proceed.
+
+; Non-trivial LatencyAdjustment.
+define i64 @test3(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %p4 = add i64 %p3, %1
+ %ptr = inttoptr i64 %p4 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-LABEL: @test3
+; CHECK: select
+}
+
+; Short dependence chain.
+define i64 @test4(i64* %pp, i64* %p) {
+entry:
+ %0 = load i64, i64* %pp, align 8
+ %cmp = icmp slt i64 %0, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ ret i64 %val
+
+; CHECK-LABEL: @test4
+; CHECK: select
+}
+
+; High IPC.
+define i64 @test5(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ %2 = add i64 %pint, 2
+ %3 = add i64 %pint, 3
+ %4 = or i64 %pint, 16
+ %5 = and i64 %pint, 255
+
+ %6 = or i64 %2, 9
+ %7 = and i64 %3, 255
+ %8 = add i64 %4, 4
+ %9 = add i64 %5, 5
+
+ %10 = add i64 %6, 2
+ %11 = add i64 %7, 3
+ %12 = add i64 %8, 4
+ %13 = add i64 %9, 5
+
+ %14 = add i64 %10, 6
+ %15 = add i64 %11, 7
+ %16 = add i64 %12, 8
+ %17 = add i64 %13, 9
+
+ %18 = add i64 %14, 10
+ %19 = add i64 %15, 11
+ %20 = add i64 %16, 12
+ %21 = add i64 %17, 13
+
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+
+ ret i64 %val
+
+; CHECK-LABEL: @test5
+; CHECK: select
+}
+
+; Large BB size.
+define i64 @test6(i64** %pp, i64* %p) {
+entry:
+ %0 = load i64*, i64** %pp, align 8
+ %1 = load i64, i64* %0, align 8
+ %cmp = icmp slt i64 %1, 0
+ %pint = ptrtoint i64* %p to i64
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ %p1 = add i64 %pint, 8
+ br label %cond.end
+
+cond.false:
+ %p2 = or i64 %pint, 16
+ br label %cond.end
+
+cond.end:
+ %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+ %ptr = inttoptr i64 %p3 to i64*
+ %val = load i64, i64* %ptr, align 8
+ %2 = add i64 %pint, 2
+ %3 = add i64 %pint, 3
+ %4 = add i64 %2, 4
+ %5 = add i64 %3, 5
+ %6 = add i64 %4, 6
+ %7 = add i64 %5, 7
+ %8 = add i64 %6, 6
+ %9 = add i64 %7, 7
+ %10 = add i64 %8, 6
+ %11 = add i64 %9, 7
+ %12 = add i64 %10, 6
+ %13 = add i64 %11, 7
+ %14 = add i64 %12, 6
+ %15 = add i64 %13, 7
+ %16 = add i64 %14, 6
+ %17 = add i64 %15, 7
+ %18 = add i64 %16, 6
+ %19 = add i64 %17, 7
+ %20 = add i64 %18, 6
+ %21 = add i64 %19, 7
+ %22 = add i64 %20, 6
+ %23 = add i64 %21, 7
+ %24 = add i64 %22, 6
+ %25 = add i64 %23, 7
+ %26 = add i64 %24, 6
+ %27 = add i64 %25, 7
+ %28 = add i64 %26, 6
+ %29 = add i64 %27, 7
+ %30 = add i64 %28, 6
+ %31 = add i64 %29, 7
+ %32 = add i64 %30, 8
+ %33 = add i64 %31, 9
+ %34 = add i64 %32, %33
+ %35 = and i64 %34, 255
+ %res = add i64 %val, %35
+
+ ret i64 %res
+
+; CHECK-LABEL: @test6
+; CHECK: select
+}
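
The if-conversion.ll cases above only exercise the cost heuristic; as a conceptual illustration (plain C++, not LLVM source, names are mine), if-conversion trades a conditional branch for unconditional evaluation of both arms plus a select, which helps for hard-to-predict branches but adds latency when the selected value feeds a long dependence chain such as the dependent load in test1 and test2:

#include <cstdint>

// Branchy form: only the taken arm is evaluated.
uint64_t branchy(bool cond, uint64_t p) {
  uint64_t addr;
  if (cond)
    addr = p + 8;   // cond.true
  else
    addr = p | 16;  // cond.false
  return addr;      // a dependent load of *addr would start here
}

// If-converted form: both arms are evaluated and a select (cmov) picks one,
// removing misprediction risk but lengthening the value's critical path.
uint64_t converted(bool cond, uint64_t p) {
  uint64_t t = p + 8;
  uint64_t f = p | 16;
  return cond ? t : f;
}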
diff --git a/test/tools/llvm-cvtres/machine.test b/test/tools/llvm-cvtres/machine.test
index cac36062f301..5ce6cbaf697d 100644
--- a/test/tools/llvm-cvtres/machine.test
+++ b/test/tools/llvm-cvtres/machine.test
@@ -29,46 +29,46 @@ X86: Machine: IMAGE_FILE_MACHINE_I386 (0x14C)
X86-DAG: Relocations [
X86-DAG: .rsrc$01 {
X86-NEXT: 0x1E8 IMAGE_REL_I386_DIR32NB $R000000
-X86-NEXT: 0x198 IMAGE_REL_I386_DIR32NB $R000018
-X86-NEXT: 0x1A8 IMAGE_REL_I386_DIR32NB $R000340
-X86-NEXT: 0x1C8 IMAGE_REL_I386_DIR32NB $R000668
-X86-NEXT: 0x1D8 IMAGE_REL_I386_DIR32NB $R000698
-X86-NEXT: 0x1F8 IMAGE_REL_I386_DIR32NB $R000708
-X86-NEXT: 0x1B8 IMAGE_REL_I386_DIR32NB $R000720
-X86-NEXT: 0x188 IMAGE_REL_I386_DIR32NB $R000750
+X86-NEXT: 0x198 IMAGE_REL_I386_DIR32NB $R000001
+X86-NEXT: 0x1A8 IMAGE_REL_I386_DIR32NB $R000002
+X86-NEXT: 0x1C8 IMAGE_REL_I386_DIR32NB $R000003
+X86-NEXT: 0x1D8 IMAGE_REL_I386_DIR32NB $R000004
+X86-NEXT: 0x1F8 IMAGE_REL_I386_DIR32NB $R000005
+X86-NEXT: 0x1B8 IMAGE_REL_I386_DIR32NB $R000006
+X86-NEXT: 0x188 IMAGE_REL_I386_DIR32NB $R000007
X64: Machine: IMAGE_FILE_MACHINE_AMD64 (0x8664)
X64-DAG: Relocations [
X64-DAG: .rsrc$01 {
X64-NEXT: 0x1E8 IMAGE_REL_AMD64_ADDR32NB $R000000
-X64-NEXT: 0x198 IMAGE_REL_AMD64_ADDR32NB $R000018
-X64-NEXT: 0x1A8 IMAGE_REL_AMD64_ADDR32NB $R000340
-X64-NEXT: 0x1C8 IMAGE_REL_AMD64_ADDR32NB $R000668
-X64-NEXT: 0x1D8 IMAGE_REL_AMD64_ADDR32NB $R000698
-X64-NEXT: 0x1F8 IMAGE_REL_AMD64_ADDR32NB $R000708
-X64-NEXT: 0x1B8 IMAGE_REL_AMD64_ADDR32NB $R000720
-X64-NEXT: 0x188 IMAGE_REL_AMD64_ADDR32NB $R000750
+X64-NEXT: 0x198 IMAGE_REL_AMD64_ADDR32NB $R000001
+X64-NEXT: 0x1A8 IMAGE_REL_AMD64_ADDR32NB $R000002
+X64-NEXT: 0x1C8 IMAGE_REL_AMD64_ADDR32NB $R000003
+X64-NEXT: 0x1D8 IMAGE_REL_AMD64_ADDR32NB $R000004
+X64-NEXT: 0x1F8 IMAGE_REL_AMD64_ADDR32NB $R000005
+X64-NEXT: 0x1B8 IMAGE_REL_AMD64_ADDR32NB $R000006
+X64-NEXT: 0x188 IMAGE_REL_AMD64_ADDR32NB $R000007
ARM: Machine: IMAGE_FILE_MACHINE_ARMNT (0x1C4)
ARM-DAG: Relocations [
ARM-DAG: .rsrc$01 {
ARM-NEXT: 0x1E8 IMAGE_REL_ARM_ADDR32NB $R000000
-ARM-NEXT: 0x198 IMAGE_REL_ARM_ADDR32NB $R000018
-ARM-NEXT: 0x1A8 IMAGE_REL_ARM_ADDR32NB $R000340
-ARM-NEXT: 0x1C8 IMAGE_REL_ARM_ADDR32NB $R000668
-ARM-NEXT: 0x1D8 IMAGE_REL_ARM_ADDR32NB $R000698
-ARM-NEXT: 0x1F8 IMAGE_REL_ARM_ADDR32NB $R000708
-ARM-NEXT: 0x1B8 IMAGE_REL_ARM_ADDR32NB $R000720
-ARM-NEXT: 0x188 IMAGE_REL_ARM_ADDR32NB $R000750
+ARM-NEXT: 0x198 IMAGE_REL_ARM_ADDR32NB $R000001
+ARM-NEXT: 0x1A8 IMAGE_REL_ARM_ADDR32NB $R000002
+ARM-NEXT: 0x1C8 IMAGE_REL_ARM_ADDR32NB $R000003
+ARM-NEXT: 0x1D8 IMAGE_REL_ARM_ADDR32NB $R000004
+ARM-NEXT: 0x1F8 IMAGE_REL_ARM_ADDR32NB $R000005
+ARM-NEXT: 0x1B8 IMAGE_REL_ARM_ADDR32NB $R000006
+ARM-NEXT: 0x188 IMAGE_REL_ARM_ADDR32NB $R000007
ARM64: Machine: IMAGE_FILE_MACHINE_ARM64 (0xAA64)
ARM64-DAG: Relocations [
ARM64-DAG: .rsrc$01 {
ARM64-NEXT: 0x1E8 IMAGE_REL_ARM64_ADDR32NB $R000000
-ARM64-NEXT: 0x198 IMAGE_REL_ARM64_ADDR32NB $R000018
-ARM64-NEXT: 0x1A8 IMAGE_REL_ARM64_ADDR32NB $R000340
-ARM64-NEXT: 0x1C8 IMAGE_REL_ARM64_ADDR32NB $R000668
-ARM64-NEXT: 0x1D8 IMAGE_REL_ARM64_ADDR32NB $R000698
-ARM64-NEXT: 0x1F8 IMAGE_REL_ARM64_ADDR32NB $R000708
-ARM64-NEXT: 0x1B8 IMAGE_REL_ARM64_ADDR32NB $R000720
-ARM64-NEXT: 0x188 IMAGE_REL_ARM64_ADDR32NB $R000750
+ARM64-NEXT: 0x198 IMAGE_REL_ARM64_ADDR32NB $R000001
+ARM64-NEXT: 0x1A8 IMAGE_REL_ARM64_ADDR32NB $R000002
+ARM64-NEXT: 0x1C8 IMAGE_REL_ARM64_ADDR32NB $R000003
+ARM64-NEXT: 0x1D8 IMAGE_REL_ARM64_ADDR32NB $R000004
+ARM64-NEXT: 0x1F8 IMAGE_REL_ARM64_ADDR32NB $R000005
+ARM64-NEXT: 0x1B8 IMAGE_REL_ARM64_ADDR32NB $R000006
+ARM64-NEXT: 0x188 IMAGE_REL_ARM64_ADDR32NB $R000007
diff --git a/test/tools/llvm-cvtres/symbols.test b/test/tools/llvm-cvtres/symbols.test
index 2ca3a193ac40..14f5c360d454 100644
--- a/test/tools/llvm-cvtres/symbols.test
+++ b/test/tools/llvm-cvtres/symbols.test
@@ -13,21 +13,21 @@ RUN: llvm-readobj -symbols %t | FileCheck %s
CHECK: Name: $R000000
CHECK-NEXT: Value: 0
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000018
+CHECK: Name: $R000001
CHECK-NEXT: Value: 24
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000340
+CHECK: Name: $R000002
CHECK-NEXT: Value: 832
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000668
+CHECK: Name: $R000003
CHECK-NEXT: Value: 1640
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000698
+CHECK: Name: $R000004
CHECK-NEXT: Value: 1688
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000720
+CHECK: Name: $R000006
CHECK-NEXT: Value: 1824
CHECK-NEXT: Section: .rsrc$02
-CHECK: Name: $R000750
+CHECK: Name: $R000007
CHECK-NEXT: Value: 1872
CHECK-NEXT: Section: .rsrc$02
diff --git a/test/tools/llvm-dwarfdump/X86/lookup.s b/test/tools/llvm-dwarfdump/X86/lookup.s
index 652844447c11..d09528d667fa 100644
--- a/test/tools/llvm-dwarfdump/X86/lookup.s
+++ b/test/tools/llvm-dwarfdump/X86/lookup.s
@@ -1,5 +1,10 @@
# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \
-# RUN: | llvm-dwarfdump -lookup=0x7fffffff - | \
+# RUN: | llvm-dwarfdump -lookup=0xffffffff - | \
+# RUN: FileCheck %s --check-prefix=EMPTY --allow-empty
+# EMPTY: {{^$}}
+
+# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \
+# RUN: | llvm-dwarfdump -lookup=0xffffffffffffffff - | \
# RUN: FileCheck %s --check-prefix=EMPTY --allow-empty
# EMPTY: {{^$}}
diff --git a/test/tools/llvm-objcopy/add-section-remove.test b/test/tools/llvm-objcopy/add-section-remove.test
new file mode 100644
index 000000000000..0dee1182a28d
--- /dev/null
+++ b/test/tools/llvm-objcopy/add-section-remove.test
@@ -0,0 +1,36 @@
+# RUN: yaml2obj %s > %t
+# RUN: echo 0000 > %t.sec
+# RUN: llvm-objcopy -R=.test2 -add-section=.test2=%t.sec %t %t2
+# RUN: llvm-readobj -file-headers -sections -section-data %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .test1
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "c3c3c3c3"
+ - Name: .test2
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "DEADBEEF"
+ - Name: .test3
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "32323232"
+
+# CHECK: SectionHeaderCount: 7
+
+# CHECK: Name: .test1
+# CHECK: Name: .test3
+# CHECK: Name: .symtab
+# CHECK: Name: .strtab
+# CHECK: Name: .shstrtab
+# CHECK: Name: .test2
+# CHECK: SectionData (
+# CHECK-NEXT: 0000: 30303030
+# CHECK-NEXT: )
diff --git a/test/tools/llvm-objcopy/add-section.test b/test/tools/llvm-objcopy/add-section.test
new file mode 100644
index 000000000000..048edcba227f
--- /dev/null
+++ b/test/tools/llvm-objcopy/add-section.test
@@ -0,0 +1,37 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objcopy -O binary -j .test2 %t %t.sec
+# RUN: llvm-objcopy -R=.test2 %t %t2
+# RUN: llvm-objcopy -add-section=.test2=%t.sec %t2 %t3
+# RUN: llvm-readobj -file-headers -sections -section-data %t3 | FileCheck %s
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .test1
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "c3c3c3c3"
+ - Name: .test2
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "DEADBEEF"
+ - Name: .test3
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Content: "32323232"
+
+# CHECK: SectionHeaderCount: 7
+
+# CHECK: Name: .test1
+# CHECK: Name: .test3
+# CHECK: Name: .symtab
+# CHECK: Name: .strtab
+# CHECK: Name: .shstrtab
+# CHECK: Name: .test2
+# CHECK: SectionData (
+# CHECK-NEXT: 0000: DEADBEEF
+# CHECK-NEXT: )
diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test
index 65ccf13f2b4c..a5c15fdfe230 100644
--- a/test/tools/llvm-readobj/mips-got.test
+++ b/test/tools/llvm-readobj/mips-got.test
@@ -1,4 +1,4 @@
-RUN: llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips | \
+RUN: not llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \
RUN: FileCheck %s -check-prefix GOT-OBJ
RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips | \
RUN: FileCheck %s -check-prefix GOT-EXE
@@ -11,7 +11,26 @@ RUN: FileCheck %s -check-prefix GOT-EMPTY
RUN: llvm-readobj -mips-plt-got %p/Inputs/got-static.exe.mips | \
RUN: FileCheck %s -check-prefix GOT-STATIC
-GOT-OBJ: Cannot find .got section
+RUN: not llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips \
+RUN: --elf-output-style=GNU 2>&1 | \
+RUN: FileCheck %s -check-prefix GNU-GOT-OBJ
+RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-EXE
+RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-so.mips \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-SO
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-tls.so.elf-mips64el \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-TLS
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-empty.exe.mipsel \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-EMPTY
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-static.exe.mips \
+RUN: --elf-output-style=GNU | \
+RUN: FileCheck %s -check-prefix GNU-GOT-STATIC
+
+GOT-OBJ: Error reading file: Cannot find .got section.
GOT-EXE: Primary GOT {
GOT-EXE-NEXT: Canonical gp value: 0x418880
@@ -366,3 +385,116 @@ GOT-STATIC-NEXT: Initial: 0x400104
GOT-STATIC-NEXT: }
GOT-STATIC-NEXT: ]
GOT-STATIC-NEXT: }
+
+GNU-GOT-OBJ: Error reading file: Cannot find .got section.
+
+GNU-GOT-EXE: Primary GOT:
+GNU-GOT-EXE-NEXT: Canonical gp value: 00418880
+
+GNU-GOT-EXE: Reserved entries:
+GNU-GOT-EXE-NEXT: Address Access Initial Purpose
+GNU-GOT-EXE-NEXT: 00410890 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-EXE-NEXT: 00410894 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-EXE: Local entries:
+GNU-GOT-EXE-NEXT: Address Access Initial
+GNU-GOT-EXE-NEXT: 00410898 -32744(gp) 00400418
+GNU-GOT-EXE-NEXT: 0041089c -32740(gp) 00410840
+GNU-GOT-EXE-NEXT: 004108a0 -32736(gp) 00000000
+
+GNU-GOT-EXE: Global entries:
+GNU-GOT-EXE-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-GOT-EXE-NEXT: 004108a4 -32732(gp) 00000000 00000000 FUNC UND __gmon_start__
+
+GNU-GOT-EXE: PLT GOT:
+
+GNU-GOT-EXE: Reserved entries:
+GNU-GOT-EXE-NEXT: Address Initial Purpose
+GNU-GOT-EXE-NEXT: 00410854 00000000 PLT lazy resolver
+GNU-GOT-EXE-NEXT: 00410894 80000000 Module pointer
+
+GNU-GOT-EXE: Entries:
+GNU-GOT-EXE-NEXT: Address Initial Sym.Val. Type Ndx Name
+GNU-GOT-EXE-NEXT: 0041085c 00400800 00000000 FUNC UND puts
+GNU-GOT-EXE-NEXT: 00410860 00400800 00000000 FUNC UND __libc_start_main
+
+GNU-GOT-SO: Primary GOT:
+GNU-GOT-SO-NEXT: Canonical gp value: 000188d0
+
+GNU-GOT-SO: Reserved entries:
+GNU-GOT-SO-NEXT: Address Access Initial Purpose
+GNU-GOT-SO-NEXT: 000108e0 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-SO-NEXT: 000108e4 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-SO: Local entries:
+GNU-GOT-SO-NEXT: Address Access Initial
+GNU-GOT-SO-NEXT: 000108e8 -32744(gp) 000108e0
+GNU-GOT-SO-NEXT: 000108ec -32740(gp) 00010000
+GNU-GOT-SO-NEXT: 000108f0 -32736(gp) 00010920
+GNU-GOT-SO-NEXT: 000108f4 -32732(gp) 000108cc
+GNU-GOT-SO-NEXT: 000108f8 -32728(gp) 00000000
+GNU-GOT-SO-NEXT: 000108fc -32724(gp) 00000000
+GNU-GOT-SO-NEXT: 00010900 -32720(gp) 00000000
+GNU-GOT-SO-NEXT: 00010904 -32716(gp) 00000000
+
+GNU-GOT-SO: Global entries:
+GNU-GOT-SO-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-GOT-SO-NEXT: 00010908 -32712(gp) 00000000 00000000 NOTYPE UND _ITM_registerTMCloneTable
+GNU-GOT-SO-NEXT: 0001090c -32708(gp) 00000000 00000000 NOTYPE UND _Jv_RegisterClasses
+GNU-GOT-SO-NEXT: 00010910 -32704(gp) 00000000 00000000 FUNC UND __gmon_start__
+GNU-GOT-SO-NEXT: 00010914 -32700(gp) 00000840 00000840 FUNC UND puts
+GNU-GOT-SO-NEXT: 00010918 -32696(gp) 00000000 00000000 NOTYPE UND _ITM_deregisterTMCloneTable
+GNU-GOT-SO-NEXT: 0001091c -32692(gp) 00000000 00000000 FUNC UND __cxa_finalize
+
+GNU-GOT-TLS: Primary GOT:
+GNU-GOT-TLS-NEXT: Canonical gp value: 0000000000018bf0
+
+GNU-GOT-TLS: Reserved entries:
+GNU-GOT-TLS-NEXT: Address Access Initial Purpose
+GNU-GOT-TLS-NEXT: 0000000000010c00 -32752(gp) 0000000000000000 Lazy resolver
+GNU-GOT-TLS-NEXT: 0000000000010c08 -32744(gp) 8000000000000000 Module pointer (GNU extension)
+
+GNU-GOT-TLS: Local entries:
+GNU-GOT-TLS-NEXT: Address Access Initial
+GNU-GOT-TLS-NEXT: 0000000000010c10 -32736(gp) 0000000000010000
+GNU-GOT-TLS-NEXT: 0000000000010c18 -32728(gp) 0000000000010c00
+GNU-GOT-TLS-NEXT: 0000000000010c20 -32720(gp) 0000000000010cb8
+GNU-GOT-TLS-NEXT: 0000000000010c28 -32712(gp) 0000000000010bf0
+GNU-GOT-TLS-NEXT: 0000000000010c30 -32704(gp) 0000000000000000
+GNU-GOT-TLS-NEXT: 0000000000010c38 -32696(gp) 0000000000000948
+GNU-GOT-TLS-NEXT: 0000000000010c40 -32688(gp) 0000000000000a20
+GNU-GOT-TLS-NEXT: 0000000000010c48 -32680(gp) 0000000000000af0
+GNU-GOT-TLS-NEXT: 0000000000010c50 -32672(gp) 0000000000000000
+GNU-GOT-TLS-NEXT: 0000000000010c58 -32664(gp) 0000000000000000
+GNU-GOT-TLS-NEXT: 0000000000010c60 -32656(gp) 0000000000000000
+
+GNU-GOT-TLS: Global entries:
+GNU-GOT-TLS-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-GOT-TLS-NEXT: 0000000000010c68 -32648(gp) 0000000000000000 0000000000000000 NOTYPE UND _ITM_registerTMCloneTable
+GNU-GOT-TLS-NEXT: 0000000000010c70 -32640(gp) 0000000000000000 0000000000000000 NOTYPE UND _Jv_RegisterClasses
+GNU-GOT-TLS-NEXT: 0000000000010c78 -32632(gp) 0000000000000000 0000000000000000 FUNC UND __gmon_start__
+GNU-GOT-TLS-NEXT: 0000000000010c80 -32624(gp) 0000000000000b60 0000000000000b60 FUNC UND __tls_get_addr
+GNU-GOT-TLS-NEXT: 0000000000010c88 -32616(gp) 0000000000000000 0000000000000000 NOTYPE UND _ITM_deregisterTMCloneTable
+GNU-GOT-TLS-NEXT: 0000000000010c90 -32608(gp) 0000000000000000 0000000000000000 FUNC UND __cxa_finalize
+
+GNU-GOT-EMPTY: Primary GOT:
+GNU-GOT-EMPTY: Canonical gp value: 00409ff0
+
+GNU-GOT-EMPTY: Reserved entries:
+GNU-GOT-EMPTY: Address Access Initial Purpose
+GNU-GOT-EMPTY: 00402000 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-EMPTY: 00402004 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-STATIC: Static GOT:
+GNU-GOT-STATIC-NEXT: Canonical gp value: 00418100
+
+GNU-GOT-STATIC: Reserved entries:
+GNU-GOT-STATIC-NEXT: Address Access Initial Purpose
+GNU-GOT-STATIC-NEXT: 00410110 -32752(gp) 00000000 Lazy resolver
+GNU-GOT-STATIC-NEXT: 00410114 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU-GOT-STATIC: Local entries:
+GNU-GOT-STATIC-NEXT: Address Access Initial
+GNU-GOT-STATIC-NEXT: 00410118 -32744(gp) 00400000
+GNU-GOT-STATIC-NEXT: 0041011c -32740(gp) 00400100
+GNU-GOT-STATIC-NEXT: 00410120 -32736(gp) 00400104
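
In the GNU-style GOT dump added above, the Access column is the entry's gp-relative offset: the canonical gp value is the .got section address plus 0x7ff0 (see the removed printLocalGOT code later in this patch), and each entry's access is its address minus gp. A minimal standalone sketch, using the GNU-GOT-EXE numbers above, reproduces the first reserved entry's -32752(gp):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t GotAddr = 0x410890;  // address of the first reserved entry
  const uint64_t GpValue = 0x418880;  // canonical gp = .got address + 0x7ff0
  const int64_t Access = static_cast<int64_t>(GotAddr - GpValue);
  std::printf("%lld(gp)\n", static_cast<long long>(Access));  // prints -32752(gp)
  return 0;
}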
diff --git a/test/tools/llvm-readobj/mips-plt.test b/test/tools/llvm-readobj/mips-plt.test
index ab0824b0be68..f41940c9cf34 100644
--- a/test/tools/llvm-readobj/mips-plt.test
+++ b/test/tools/llvm-readobj/mips-plt.test
@@ -1,4 +1,7 @@
RUN: llvm-readobj -mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s
+RUN: llvm-readobj -mips-plt-got --elf-output-style=GNU \
+RUN: %p/Inputs/got-plt.exe.elf-mipsel \
+RUN: | FileCheck --check-prefix=GNU %s
CHECK: PLT GOT {
CHECK-NEXT: Reserved entries [
@@ -32,3 +35,32 @@ CHECK-NEXT: Name: __libc_start_main@GLIBC_2.0 (53)
CHECK-NEXT: }
CHECK-NEXT: ]
CHECK-NEXT: }
+
+GNU: Primary GOT:
+GNU-NEXT: Canonical gp value: 00418840
+
+GNU: Reserved entries:
+GNU-NEXT: Address Access Initial Purpose
+GNU-NEXT: 00410850 -32752(gp) 00000000 Lazy resolver
+GNU-NEXT: 00410854 -32748(gp) 80000000 Module pointer (GNU extension)
+
+GNU: Local entries:
+GNU-NEXT: Address Access Initial
+GNU-NEXT: 00410858 -32744(gp) 004003d4
+GNU-NEXT: 0041085c -32740(gp) 00410800
+GNU-NEXT: 00410860 -32736(gp) 00000000
+
+GNU: Global entries:
+GNU-NEXT: Address Access Initial Sym.Val. Type Ndx Name
+GNU-NEXT: 00410864 -32732(gp) 00000000 00000000 FUNC UND __gmon_start__
+GNU-NEXT: PLT GOT:
+
+GNU: Reserved entries:
+GNU-NEXT: Address Initial Purpose
+GNU-NEXT: 00410814 00000000 PLT lazy resolver
+GNU-NEXT: 00410854 80000000 Module pointer
+
+GNU: Entries:
+GNU-NEXT: Address Initial Sym.Val. Type Ndx Name
+GNU-NEXT: 0041081c 004007c0 00000000 FUNC UND puts
+GNU-NEXT: 00410820 004007c0 00000000 FUNC UND __libc_start_main
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index e4e34efff842..12c005de6005 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -156,8 +156,7 @@ static list<std::string> Name(
"the -regex option <pattern> is interpreted as a regular expression."),
value_desc("pattern"), cat(DwarfDumpCategory));
static alias NameAlias("n", desc("Alias for -name"), aliasopt(Name));
-static opt<unsigned>
- Lookup("lookup",
+static opt<unsigned long long> Lookup("lookup",
desc("Lookup <address> in the debug information and print out any"
"available file, function, block and line table details."),
value_desc("address"), cat(DwarfDumpCategory));
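
The -lookup option's type widens from unsigned to unsigned long long so that full 64-bit addresses, such as the 0xffffffffffffffff case added to lookup.s above, survive command-line parsing instead of being truncated. A standalone sketch of the truncation the narrower type would cause (illustration only, not LLVM code; assumes the usual 32-bit unsigned):

#include <cstdio>

int main() {
  const unsigned long long Addr = 0xffffffffffffffffULL;  // value used in lookup.s
  const unsigned Truncated = static_cast<unsigned>(Addr); // old option type
  std::printf("full: %llx truncated: %x\n", Addr, Truncated);
  // prints: full: ffffffffffffffff truncated: ffffffff
  return 0;
}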
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index 20c6813968be..7d71a3e8dfe3 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -156,6 +156,9 @@ static cl::opt<std::string> ThinLTOModuleId(
static cl::opt<std::string>
ThinLTOCacheDir("thinlto-cache-dir", cl::desc("Enable ThinLTO caching."));
+static cl::opt<int>
+ ThinLTOCachePruningInterval("thinlto-cache-pruning-interval", cl::desc("Set ThinLTO cache pruning interval."));
+
static cl::opt<std::string> ThinLTOSaveTempsPrefix(
"thinlto-save-temps",
cl::desc("Save ThinLTO temp files using filenames created by adding "
@@ -470,6 +473,7 @@ public:
ThinGenerator.setCodePICModel(getRelocModel());
ThinGenerator.setTargetOptions(Options);
ThinGenerator.setCacheDir(ThinLTOCacheDir);
+ ThinGenerator.setCachePruningInterval(ThinLTOCachePruningInterval);
ThinGenerator.setFreestanding(EnableFreestanding);
// Add all the exported symbols to the table of symbols to preserve.
diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp
index bd5bcd7fc188..d5dfcac40e4e 100644
--- a/tools/llvm-objcopy/Object.cpp
+++ b/tools/llvm-objcopy/Object.cpp
@@ -81,6 +81,11 @@ void Section::writeSection(FileOutputBuffer &Out) const {
std::copy(std::begin(Contents), std::end(Contents), Buf);
}
+void OwnedDataSection::writeSection(FileOutputBuffer &Out) const {
+ uint8_t *Buf = Out.getBufferStart() + Offset;
+ std::copy(std::begin(Data), std::end(Data), Buf);
+}
+
void StringTableSection::addString(StringRef Name) {
StrTabBuilder.add(Name);
Size = StrTabBuilder.getSize();
@@ -676,6 +681,13 @@ void Object<ELFT>::removeSections(
Sections.erase(Iter, std::end(Sections));
}
+template <class ELFT>
+void Object<ELFT>::addSection(StringRef SecName, ArrayRef<uint8_t> Data) {
+ auto Sec = llvm::make_unique<OwnedDataSection>(SecName, Data);
+ Sec->OriginalOffset = ~0ULL;
+ Sections.push_back(std::move(Sec));
+}
+
template <class ELFT> void ELFObject<ELFT>::sortSections() {
// Put all sections in offset order. Maintain the ordering as closely as
// possible while meeting that demand however.
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h
index 9f98c04ad9bb..b04b0c1a6415 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/Object.h
@@ -126,6 +126,20 @@ public:
void writeSection(FileOutputBuffer &Out) const override;
};
+class OwnedDataSection : public SectionBase {
+private:
+ std::vector<uint8_t> Data;
+
+public:
+ OwnedDataSection(StringRef SecName, ArrayRef<uint8_t> Data)
+ : Data(std::begin(Data), std::end(Data)) {
+ Name = SecName;
+ Type = ELF::SHT_PROGBITS;
+ Size = Data.size();
+ }
+ void writeSection(FileOutputBuffer &Out) const override;
+};
+
// There are two types of string tables that can exist, dynamic and not dynamic.
// In the dynamic case the string table is allocated. Changing a dynamic string
// table would mean altering virtual addresses and thus the memory image. So
@@ -372,6 +386,7 @@ public:
const SymbolTableSection *getSymTab() const { return SymbolTable; }
const SectionBase *getSectionHeaderStrTab() const { return SectionNames; }
void removeSections(std::function<bool(const SectionBase &)> ToRemove);
+ void addSection(StringRef SecName, ArrayRef<uint8_t> Data);
virtual size_t totalSize() const = 0;
virtual void finalize() = 0;
virtual void write(FileOutputBuffer &Out) const = 0;
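
OwnedDataSection copies the incoming bytes into a std::vector it owns, so the added section's lifetime is independent of the buffer the caller read the file into. A minimal standalone sketch of that ownership pattern (illustrative names, not the LLVM classes):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// A section-like object that owns its payload: constructed from a caller's
// span, it keeps a private copy and can write it out after the source is gone.
class OwnedBlob {
  std::vector<uint8_t> Data;

public:
  OwnedBlob(const uint8_t *Begin, size_t Size) : Data(Begin, Begin + Size) {}

  size_t size() const { return Data.size(); }

  // Analogous to writeSection(): copy the owned bytes into an output buffer.
  void writeTo(uint8_t *Out) const {
    if (!Data.empty())
      std::memcpy(Out, Data.data(), Data.size());
  }
};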
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index 9d60ae426390..20ce93bb40e8 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -113,6 +113,10 @@ static cl::opt<std::string>
cl::desc("Equivalent to extract-dwo on the input file to "
"<dwo-file>, then strip-dwo on the input file"),
cl::value_desc("dwo-file"));
+static cl::list<std::string> AddSection(
+ "add-section",
+ cl::desc("Make a section named <section> with the contents of <file>."),
+ cl::value_desc("section=file"));
using SectionPred = std::function<bool(const SectionBase &Sec)>;
@@ -174,7 +178,7 @@ template <class ELFT> void CopyBinary(const ELFObjectFile<ELFT> &ObjFile) {
Obj = llvm::make_unique<ELFObject<ELFT>>(ObjFile);
if (!SplitDWO.empty())
- SplitDWOToFile<ELFT>(ObjFile, SplitDWO.getValue());
+ SplitDWOToFile<ELFT>(ObjFile, SplitDWO.getValue());
SectionPred RemovePred = [](const SectionBase &) { return false; };
@@ -286,6 +290,22 @@ template <class ELFT> void CopyBinary(const ELFObjectFile<ELFT> &ObjFile) {
}
Obj->removeSections(RemovePred);
+
+ if (!AddSection.empty()) {
+ for (const auto &Flag : AddSection) {
+ auto SecPair = StringRef(Flag).split("=");
+ auto SecName = SecPair.first;
+ auto File = SecPair.second;
+ auto BufOrErr = MemoryBuffer::getFile(File);
+ if (!BufOrErr)
+ reportError(File, BufOrErr.getError());
+ auto Buf = std::move(*BufOrErr);
+ auto BufPtr = reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
+ auto BufSize = Buf->getBufferSize();
+ Obj->addSection(SecName, ArrayRef<uint8_t>(BufPtr, BufSize));
+ }
+ }
+
Obj->finalize();
WriteObjectFile(*Obj, OutputFilename.getValue());
}
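
Each -add-section value is split on its first '=' into a section name and a file path, and the file's raw bytes are handed to Object::addSection. A standalone sketch of that flag-parsing convention using only the standard library (the flag value and file name below are made up for illustration):

#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
  const std::string Flag = ".test2=payload.bin";  // as in -add-section=.test2=payload.bin
  const std::size_t Eq = Flag.find('=');
  if (Eq == std::string::npos) {
    std::cerr << "expected <section>=<file>\n";
    return 1;
  }
  const std::string SecName = Flag.substr(0, Eq);
  const std::string File = Flag.substr(Eq + 1);

  std::ifstream In(File, std::ios::binary);
  if (!In) {
    std::cerr << "cannot open " << File << "\n";
    return 1;
  }
  std::vector<uint8_t> Data((std::istreambuf_iterator<char>(In)),
                            std::istreambuf_iterator<char>());
  std::cout << "would add section " << SecName << " with " << Data.size()
            << " bytes\n";
  return 0;
}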
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 9678667abffe..f2b936904393 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -77,6 +77,7 @@ using namespace ELF;
#define TYPEDEF_ELF_TYPES(ELFT) \
using ELFO = ELFFile<ELFT>; \
+ using Elf_Addr = typename ELFO::Elf_Addr; \
using Elf_Shdr = typename ELFO::Elf_Shdr; \
using Elf_Sym = typename ELFO::Elf_Sym; \
using Elf_Dyn = typename ELFO::Elf_Dyn; \
@@ -249,6 +250,9 @@ public:
Elf_Rela_Range dyn_relas() const;
std::string getFullSymbolName(const Elf_Sym *Symbol, StringRef StrTable,
bool IsDynamic) const;
+ void getSectionNameIndex(const Elf_Sym *Symbol, const Elf_Sym *FirstSym,
+ StringRef &SectionName,
+ unsigned &SectionIndex) const;
void printSymbolsHelper(bool IsDynamic) const;
const Elf_Shdr *getDotSymtabSec() const { return DotSymtabSec; }
@@ -287,6 +291,8 @@ void ELFDumper<ELFT>::printSymbolsHelper(bool IsDynamic) const {
ELFDumperStyle->printSymbol(Obj, &Sym, Syms.begin(), StrTable, IsDynamic);
}
+template <class ELFT> class MipsGOTParser;
+
template <typename ELFT> class DumpStyle {
public:
using Elf_Shdr = typename ELFFile<ELFT>::Elf_Shdr;
@@ -310,6 +316,8 @@ public:
virtual void printProgramHeaders(const ELFFile<ELFT> *Obj) = 0;
virtual void printHashHistogram(const ELFFile<ELFT> *Obj) = 0;
virtual void printNotes(const ELFFile<ELFT> *Obj) = 0;
+ virtual void printMipsGOT(const MipsGOTParser<ELFT> &Parser) = 0;
+ virtual void printMipsPLT(const MipsGOTParser<ELFT> &Parser) = 0;
const ELFDumper<ELFT> *dumper() const { return Dumper; }
private:
@@ -337,6 +345,8 @@ public:
void printProgramHeaders(const ELFO *Obj) override;
void printHashHistogram(const ELFFile<ELFT> *Obj) override;
void printNotes(const ELFFile<ELFT> *Obj) override;
+ void printMipsGOT(const MipsGOTParser<ELFT> &Parser) override;
+ void printMipsPLT(const MipsGOTParser<ELFT> &Parser) override;
private:
struct Field {
@@ -395,6 +405,8 @@ public:
void printProgramHeaders(const ELFO *Obj) override;
void printHashHistogram(const ELFFile<ELFT> *Obj) override;
void printNotes(const ELFFile<ELFT> *Obj) override;
+ void printMipsGOT(const MipsGOTParser<ELFT> &Parser) override;
+ void printMipsPLT(const MipsGOTParser<ELFT> &Parser) override;
private:
void printRelocation(const ELFO *Obj, Elf_Rela Rel, const Elf_Shdr *SymTab);
@@ -735,11 +747,10 @@ std::string ELFDumper<ELFT>::getFullSymbolName(const Elf_Sym *Symbol,
}
template <typename ELFT>
-static void
-getSectionNameIndex(const ELFFile<ELFT> &Obj, const typename ELFT::Sym *Symbol,
- const typename ELFT::Sym *FirstSym,
- ArrayRef<typename ELFT::Word> ShndxTable,
- StringRef &SectionName, unsigned &SectionIndex) {
+void ELFDumper<ELFT>::getSectionNameIndex(const Elf_Sym *Symbol,
+ const Elf_Sym *FirstSym,
+ StringRef &SectionName,
+ unsigned &SectionIndex) const {
SectionIndex = Symbol->st_shndx;
if (Symbol->isUndefined())
SectionName = "Undefined";
@@ -758,8 +769,8 @@ getSectionNameIndex(const ELFFile<ELFT> &Obj, const typename ELFT::Sym *Symbol,
SectionIndex = unwrapOrError(object::getExtendedSymbolTableIndex<ELFT>(
Symbol, FirstSym, ShndxTable));
const typename ELFT::Shdr *Sec =
- unwrapOrError(Obj.getSection(SectionIndex));
- SectionName = unwrapOrError(Obj.getSectionName(Sec));
+ unwrapOrError(Obj->getSection(SectionIndex));
+ SectionName = unwrapOrError(Obj->getSectionName(Sec));
}
}
@@ -1905,47 +1916,83 @@ template <> void ELFDumper<ELFType<support::little, false>>::printAttributes() {
template <class ELFT> class MipsGOTParser {
public:
TYPEDEF_ELF_TYPES(ELFT)
- using GOTEntry = typename ELFO::Elf_Addr;
+ using Entry = typename ELFO::Elf_Addr;
+ using Entries = ArrayRef<Entry>;
- MipsGOTParser(ELFDumper<ELFT> *Dumper, const ELFO *Obj,
- Elf_Dyn_Range DynTable, ScopedPrinter &W);
+ const bool IsStatic;
+ const ELFO * const Obj;
- void parseStaticGOT();
- void parseGOT();
- void parsePLT();
+ MipsGOTParser(const ELFO *Obj, Elf_Dyn_Range DynTable, Elf_Sym_Range DynSyms);
-private:
- ELFDumper<ELFT> *Dumper;
- const ELFO *Obj;
- ScopedPrinter &W;
- Optional<uint64_t> DtPltGot;
- Optional<uint64_t> DtLocalGotNum;
- Optional<uint64_t> DtGotSym;
- Optional<uint64_t> DtMipsPltGot;
- Optional<uint64_t> DtJmpRel;
+ bool hasGot() const { return !GotEntries.empty(); }
+ bool hasPlt() const { return !PltEntries.empty(); }
+
+ uint64_t getGp() const;
+
+ const Entry *getGotLazyResolver() const;
+ const Entry *getGotModulePointer() const;
+ const Entry *getPltLazyResolver() const;
+ const Entry *getPltModulePointer() const;
+
+ Entries getLocalEntries() const;
+ Entries getGlobalEntries() const;
+ Entries getOtherEntries() const;
+ Entries getPltEntries() const;
+
+ uint64_t getGotAddress(const Entry * E) const;
+ int64_t getGotOffset(const Entry * E) const;
+ const Elf_Sym *getGotSym(const Entry *E) const;
+
+ uint64_t getPltAddress(const Entry * E) const;
+ const Elf_Sym *getPltSym(const Entry *E) const;
- std::size_t getGOTTotal(ArrayRef<uint8_t> GOT) const;
- const GOTEntry *makeGOTIter(ArrayRef<uint8_t> GOT, std::size_t EntryNum);
-
- void printLocalGOT(const Elf_Shdr *GOTShdr, size_t Num);
- void printGotEntry(uint64_t GotAddr, const GOTEntry *BeginIt,
- const GOTEntry *It);
- void printGlobalGotEntry(uint64_t GotAddr, const GOTEntry *BeginIt,
- const GOTEntry *It, const Elf_Sym *Sym,
- StringRef StrTable);
- void printPLTEntry(uint64_t PLTAddr, const GOTEntry *BeginIt,
- const GOTEntry *It, StringRef Purpose);
- void printPLTEntry(uint64_t PLTAddr, const GOTEntry *BeginIt,
- const GOTEntry *It, StringRef StrTable,
- const Elf_Sym *Sym);
+ StringRef getPltStrTable() const { return PltStrTable; }
+
+private:
+ const Elf_Shdr *GotSec;
+ size_t LocalNum;
+ size_t GlobalNum;
+
+ const Elf_Shdr *PltSec;
+ const Elf_Shdr *PltRelSec;
+ const Elf_Shdr *PltSymTable;
+ Elf_Sym_Range GotDynSyms;
+ StringRef PltStrTable;
+
+ Entries GotEntries;
+ Entries PltEntries;
};
} // end anonymous namespace
template <class ELFT>
-MipsGOTParser<ELFT>::MipsGOTParser(ELFDumper<ELFT> *Dumper, const ELFO *Obj,
- Elf_Dyn_Range DynTable, ScopedPrinter &W)
- : Dumper(Dumper), Obj(Obj), W(W) {
+MipsGOTParser<ELFT>::MipsGOTParser(const ELFO *Obj, Elf_Dyn_Range DynTable,
+ Elf_Sym_Range DynSyms)
+ : IsStatic(DynTable.empty()), Obj(Obj), GotSec(nullptr), LocalNum(0),
+ GlobalNum(0), PltSec(nullptr), PltRelSec(nullptr), PltSymTable(nullptr) {
+ // See "Global Offset Table" in Chapter 5 in the following document
+ // for detailed GOT description.
+ // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf
+
+ // Find static GOT section.
+ if (IsStatic) {
+ GotSec = findSectionByName(*Obj, ".got");
+ if (!GotSec)
+ reportError("Cannot find .got section");
+
+ ArrayRef<uint8_t> Content = unwrapOrError(Obj->getSectionContents(GotSec));
+ GotEntries = Entries(reinterpret_cast<const Entry *>(Content.data()),
+ Content.size() / sizeof(Entry));
+ LocalNum = GotEntries.size();
+ return;
+ }
+
+ // Lookup dynamic table tags which define GOT/PLT layouts.
+ Optional<uint64_t> DtPltGot;
+ Optional<uint64_t> DtLocalGotNum;
+ Optional<uint64_t> DtGotSym;
+ Optional<uint64_t> DtMipsPltGot;
+ Optional<uint64_t> DtJmpRel;
for (const auto &Entry : DynTable) {
switch (Entry.getTag()) {
case ELF::DT_PLTGOT:
@@ -1965,261 +2012,175 @@ MipsGOTParser<ELFT>::MipsGOTParser(ELFDumper<ELFT> *Dumper, const ELFO *Obj,
break;
}
}
-}
-
-template <class ELFT>
-void MipsGOTParser<ELFT>::printLocalGOT(const Elf_Shdr *GOTShdr, size_t Num) {
- ArrayRef<uint8_t> GOT = unwrapOrError(Obj->getSectionContents(GOTShdr));
-
- const GOTEntry *GotBegin = makeGOTIter(GOT, 0);
- const GOTEntry *GotEnd = makeGOTIter(GOT, Num);
- const GOTEntry *It = GotBegin;
-
- W.printHex("Canonical gp value", GOTShdr->sh_addr + 0x7ff0);
- {
- ListScope RS(W, "Reserved entries");
-
- {
- DictScope D(W, "Entry");
- printGotEntry(GOTShdr->sh_addr, GotBegin, It++);
- W.printString("Purpose", StringRef("Lazy resolver"));
- }
- if (It != GotEnd && (*It >> (sizeof(GOTEntry) * 8 - 1)) != 0) {
- DictScope D(W, "Entry");
- printGotEntry(GOTShdr->sh_addr, GotBegin, It++);
- W.printString("Purpose", StringRef("Module pointer (GNU extension)"));
- }
+ // Find dynamic GOT section.
+ if (DtPltGot || DtLocalGotNum || DtGotSym) {
+ if (!DtPltGot)
+ report_fatal_error("Cannot find PLTGOT dynamic table tag.");
+ if (!DtLocalGotNum)
+ report_fatal_error("Cannot find MIPS_LOCAL_GOTNO dynamic table tag.");
+ if (!DtGotSym)
+ report_fatal_error("Cannot find MIPS_GOTSYM dynamic table tag.");
+
+ size_t DynSymTotal = DynSyms.size();
+ if (*DtGotSym > DynSymTotal)
+ reportError("MIPS_GOTSYM exceeds a number of dynamic symbols");
+
+ GotSec = findNotEmptySectionByAddress(Obj, *DtPltGot);
+ if (!GotSec)
+ reportError("There is no not empty GOT section at 0x" +
+ Twine::utohexstr(*DtPltGot));
+
+ LocalNum = *DtLocalGotNum;
+ GlobalNum = DynSymTotal - *DtGotSym;
+
+ ArrayRef<uint8_t> Content = unwrapOrError(Obj->getSectionContents(GotSec));
+ GotEntries = Entries(reinterpret_cast<const Entry *>(Content.data()),
+ Content.size() / sizeof(Entry));
+ GotDynSyms = DynSyms.drop_front(*DtGotSym);
}
- {
- ListScope LS(W, "Local entries");
- for (; It != GotEnd; ++It) {
- DictScope D(W, "Entry");
- printGotEntry(GOTShdr->sh_addr, GotBegin, It);
- }
- }
-}
-template <class ELFT> void MipsGOTParser<ELFT>::parseStaticGOT() {
- const Elf_Shdr *GOTShdr = findSectionByName(*Obj, ".got");
- if (!GOTShdr) {
- W.startLine() << "Cannot find .got section.\n";
- return;
+ // Find PLT section.
+ if (DtMipsPltGot || DtJmpRel) {
+ if (!DtMipsPltGot)
+ report_fatal_error("Cannot find MIPS_PLTGOT dynamic table tag.");
+ if (!DtJmpRel)
+ report_fatal_error("Cannot find JMPREL dynamic table tag.");
+
+ PltSec = findNotEmptySectionByAddress(Obj, *DtMipsPltGot);
+ if (!PltSec)
+ report_fatal_error("There is no not empty PLTGOT section at 0x " +
+ Twine::utohexstr(*DtMipsPltGot));
+
+ PltRelSec = findNotEmptySectionByAddress(Obj, *DtJmpRel);
+ if (!PltRelSec)
+ report_fatal_error("There is no not empty RELPLT section at 0x" +
+ Twine::utohexstr(*DtJmpRel));
+
+ ArrayRef<uint8_t> PltContent =
+ unwrapOrError(Obj->getSectionContents(PltSec));
+ PltEntries = Entries(reinterpret_cast<const Entry *>(PltContent.data()),
+ PltContent.size() / sizeof(Entry));
+
+ PltSymTable = unwrapOrError(Obj->getSection(PltRelSec->sh_link));
+ PltStrTable = unwrapOrError(Obj->getStringTableForSymtab(*PltSymTable));
}
-
- DictScope GS(W, "Static GOT");
- printLocalGOT(GOTShdr, GOTShdr->sh_size / sizeof(GOTEntry));
}
-template <class ELFT> void MipsGOTParser<ELFT>::parseGOT() {
- // See "Global Offset Table" in Chapter 5 in the following document
- // for detailed GOT description.
- // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf
- if (!DtPltGot) {
- W.startLine() << "Cannot find PLTGOT dynamic table tag.\n";
- return;
- }
- if (!DtLocalGotNum) {
- W.startLine() << "Cannot find MIPS_LOCAL_GOTNO dynamic table tag.\n";
- return;
- }
- if (!DtGotSym) {
- W.startLine() << "Cannot find MIPS_GOTSYM dynamic table tag.\n";
- return;
- }
-
- std::size_t DynSymTotal = Dumper->dynamic_symbols().size();
-
- if (*DtGotSym > DynSymTotal)
- report_fatal_error("MIPS_GOTSYM exceeds a number of dynamic symbols");
-
- std::size_t GlobalGotNum = DynSymTotal - *DtGotSym;
-
- if (*DtLocalGotNum + GlobalGotNum == 0) {
- W.startLine() << "GOT is empty.\n";
- return;
- }
-
- const Elf_Shdr *GOTShdr = findNotEmptySectionByAddress(Obj, *DtPltGot);
- if (!GOTShdr)
- report_fatal_error("There is no not empty GOT section at 0x" +
- Twine::utohexstr(*DtPltGot));
-
- ArrayRef<uint8_t> GOT = unwrapOrError(Obj->getSectionContents(GOTShdr));
-
- if (*DtLocalGotNum + GlobalGotNum > getGOTTotal(GOT))
- report_fatal_error("Number of GOT entries exceeds the size of GOT section");
-
- DictScope GS(W, "Primary GOT");
- printLocalGOT(GOTShdr, *DtLocalGotNum);
-
- {
- ListScope GS(W, "Global entries");
-
- const GOTEntry *GotBegin = makeGOTIter(GOT, 0);
- const GOTEntry *GotEnd = makeGOTIter(GOT, *DtLocalGotNum + GlobalGotNum);
- const Elf_Sym *GotDynSym = Dumper->dynamic_symbols().begin() + *DtGotSym;
- for (auto It = makeGOTIter(GOT, *DtLocalGotNum); It != GotEnd; ++It) {
- DictScope D(W, "Entry");
- printGlobalGotEntry(GOTShdr->sh_addr, GotBegin, It, GotDynSym++,
- Dumper->getDynamicStringTable());
- }
- }
-
- std::size_t SpecGotNum = getGOTTotal(GOT) - *DtLocalGotNum - GlobalGotNum;
- W.printNumber("Number of TLS and multi-GOT entries", uint64_t(SpecGotNum));
+template <class ELFT> uint64_t MipsGOTParser<ELFT>::getGp() const {
+ return GotSec->sh_addr + 0x7ff0;
}
-template <class ELFT> void MipsGOTParser<ELFT>::parsePLT() {
- if (!DtMipsPltGot) {
- W.startLine() << "Cannot find MIPS_PLTGOT dynamic table tag.\n";
- return;
- }
- if (!DtJmpRel) {
- W.startLine() << "Cannot find JMPREL dynamic table tag.\n";
- return;
- }
-
- const Elf_Shdr *PLTShdr = findNotEmptySectionByAddress(Obj, *DtMipsPltGot);
- if (!PLTShdr)
- report_fatal_error("There is no not empty PLTGOT section at 0x " +
- Twine::utohexstr(*DtMipsPltGot));
- ArrayRef<uint8_t> PLT = unwrapOrError(Obj->getSectionContents(PLTShdr));
-
- const Elf_Shdr *PLTRelShdr = findNotEmptySectionByAddress(Obj, *DtJmpRel);
- if (!PLTRelShdr)
- report_fatal_error("There is no not empty RELPLT section at 0x" +
- Twine::utohexstr(*DtJmpRel));
- const Elf_Shdr *SymTable =
- unwrapOrError(Obj->getSection(PLTRelShdr->sh_link));
- StringRef StrTable = unwrapOrError(Obj->getStringTableForSymtab(*SymTable));
-
- const GOTEntry *PLTBegin = makeGOTIter(PLT, 0);
- const GOTEntry *PLTEnd = makeGOTIter(PLT, getGOTTotal(PLT));
- const GOTEntry *It = PLTBegin;
-
- DictScope GS(W, "PLT GOT");
- {
- ListScope RS(W, "Reserved entries");
- printPLTEntry(PLTShdr->sh_addr, PLTBegin, It++, "PLT lazy resolver");
- if (It != PLTEnd)
- printPLTEntry(PLTShdr->sh_addr, PLTBegin, It++, "Module pointer");
- }
- {
- ListScope GS(W, "Entries");
-
- switch (PLTRelShdr->sh_type) {
- case ELF::SHT_REL:
- for (const Elf_Rel &Rel : unwrapOrError(Obj->rels(PLTRelShdr))) {
- const Elf_Sym *Sym =
- unwrapOrError(Obj->getRelocationSymbol(&Rel, SymTable));
- printPLTEntry(PLTShdr->sh_addr, PLTBegin, It, StrTable, Sym);
- if (++It == PLTEnd)
- break;
- }
- break;
- case ELF::SHT_RELA:
- for (const Elf_Rela &Rel : unwrapOrError(Obj->relas(PLTRelShdr))) {
- const Elf_Sym *Sym =
- unwrapOrError(Obj->getRelocationSymbol(&Rel, SymTable));
- printPLTEntry(PLTShdr->sh_addr, PLTBegin, It, StrTable, Sym);
- if (++It == PLTEnd)
- break;
- }
- break;
- }
- }
+template <class ELFT>
+const typename MipsGOTParser<ELFT>::Entry *
+MipsGOTParser<ELFT>::getGotLazyResolver() const {
+ return LocalNum > 0 ? &GotEntries[0] : nullptr;
}
template <class ELFT>
-std::size_t MipsGOTParser<ELFT>::getGOTTotal(ArrayRef<uint8_t> GOT) const {
- return GOT.size() / sizeof(GOTEntry);
+const typename MipsGOTParser<ELFT>::Entry *
+MipsGOTParser<ELFT>::getGotModulePointer() const {
+ if (LocalNum < 2)
+ return nullptr;
+ const Entry &E = GotEntries[1];
+ if ((E >> (sizeof(Entry) * 8 - 1)) == 0)
+ return nullptr;
+ return &E;
}
template <class ELFT>
-const typename MipsGOTParser<ELFT>::GOTEntry *
-MipsGOTParser<ELFT>::makeGOTIter(ArrayRef<uint8_t> GOT, std::size_t EntryNum) {
- const char *Data = reinterpret_cast<const char *>(GOT.data());
- return reinterpret_cast<const GOTEntry *>(Data + EntryNum * sizeof(GOTEntry));
+typename MipsGOTParser<ELFT>::Entries
+MipsGOTParser<ELFT>::getLocalEntries() const {
+ size_t Skip = getGotModulePointer() ? 2 : 1;
+ if (LocalNum - Skip <= 0)
+ return Entries();
+ return GotEntries.slice(Skip, LocalNum - Skip);
}
template <class ELFT>
-void MipsGOTParser<ELFT>::printGotEntry(uint64_t GotAddr,
- const GOTEntry *BeginIt,
- const GOTEntry *It) {
- int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
- W.printHex("Address", GotAddr + Offset);
- W.printNumber("Access", Offset - 0x7ff0);
- W.printHex("Initial", *It);
+typename MipsGOTParser<ELFT>::Entries
+MipsGOTParser<ELFT>::getGlobalEntries() const {
+ if (GlobalNum == 0)
+ return Entries();
+ return GotEntries.slice(LocalNum, GlobalNum);
}
template <class ELFT>
-void MipsGOTParser<ELFT>::printGlobalGotEntry(uint64_t GotAddr,
- const GOTEntry *BeginIt,
- const GOTEntry *It,
- const Elf_Sym *Sym,
- StringRef StrTable) {
- printGotEntry(GotAddr, BeginIt, It);
+typename MipsGOTParser<ELFT>::Entries
+MipsGOTParser<ELFT>::getOtherEntries() const {
+ size_t OtherNum = GotEntries.size() - LocalNum - GlobalNum;
+ if (OtherNum == 0)
+ return Entries();
+ return GotEntries.slice(LocalNum + GlobalNum, OtherNum);
+}
- W.printHex("Value", Sym->st_value);
- W.printEnum("Type", Sym->getType(), makeArrayRef(ElfSymbolTypes));
+template <class ELFT>
+uint64_t MipsGOTParser<ELFT>::getGotAddress(const Entry *E) const {
+ int64_t Offset = std::distance(GotEntries.data(), E) * sizeof(Entry);
+ return GotSec->sh_addr + Offset;
+}
- unsigned SectionIndex = 0;
- StringRef SectionName;
- getSectionNameIndex(*Obj, Sym, Dumper->dynamic_symbols().begin(),
- Dumper->getShndxTable(), SectionName, SectionIndex);
- W.printHex("Section", SectionName, SectionIndex);
+template <class ELFT>
+int64_t MipsGOTParser<ELFT>::getGotOffset(const Entry *E) const {
+ int64_t Offset = std::distance(GotEntries.data(), E) * sizeof(Entry);
+ return Offset - 0x7ff0;
+}
- std::string FullSymbolName = Dumper->getFullSymbolName(Sym, StrTable, true);
- W.printNumber("Name", FullSymbolName, Sym->st_name);
+template <class ELFT>
+const typename MipsGOTParser<ELFT>::Elf_Sym *
+MipsGOTParser<ELFT>::getGotSym(const Entry *E) const {
+ int64_t Offset = std::distance(GotEntries.data(), E);
+ return &GotDynSyms[Offset - LocalNum];
}
template <class ELFT>
-void MipsGOTParser<ELFT>::printPLTEntry(uint64_t PLTAddr,
- const GOTEntry *BeginIt,
- const GOTEntry *It, StringRef Purpose) {
- DictScope D(W, "Entry");
- int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
- W.printHex("Address", PLTAddr + Offset);
- W.printHex("Initial", *It);
- W.printString("Purpose", Purpose);
+const typename MipsGOTParser<ELFT>::Entry *
+MipsGOTParser<ELFT>::getPltLazyResolver() const {
+ return PltEntries.empty() ? nullptr : &PltEntries[0];
}
template <class ELFT>
-void MipsGOTParser<ELFT>::printPLTEntry(uint64_t PLTAddr,
- const GOTEntry *BeginIt,
- const GOTEntry *It, StringRef StrTable,
- const Elf_Sym *Sym) {
- DictScope D(W, "Entry");
- int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
- W.printHex("Address", PLTAddr + Offset);
- W.printHex("Initial", *It);
- W.printHex("Value", Sym->st_value);
- W.printEnum("Type", Sym->getType(), makeArrayRef(ElfSymbolTypes));
+const typename MipsGOTParser<ELFT>::Entry *
+MipsGOTParser<ELFT>::getPltModulePointer() const {
+ return PltEntries.size() < 2 ? nullptr : &PltEntries[1];
+}
- unsigned SectionIndex = 0;
- StringRef SectionName;
- getSectionNameIndex(*Obj, Sym, Dumper->dynamic_symbols().begin(),
- Dumper->getShndxTable(), SectionName, SectionIndex);
- W.printHex("Section", SectionName, SectionIndex);
+template <class ELFT>
+typename MipsGOTParser<ELFT>::Entries
+MipsGOTParser<ELFT>::getPltEntries() const {
+ if (PltEntries.size() <= 2)
+ return Entries();
+ return PltEntries.slice(2, PltEntries.size() - 2);
+}
- std::string FullSymbolName = Dumper->getFullSymbolName(Sym, StrTable, true);
- W.printNumber("Name", FullSymbolName, Sym->st_name);
+template <class ELFT>
+uint64_t MipsGOTParser<ELFT>::getPltAddress(const Entry *E) const {
+ int64_t Offset = std::distance(PltEntries.data(), E) * sizeof(Entry);
+ return PltSec->sh_addr + Offset;
}
-template <class ELFT> void ELFDumper<ELFT>::printMipsPLTGOT() {
- if (Obj->getHeader()->e_machine != EM_MIPS) {
- W.startLine() << "MIPS PLT GOT is available for MIPS targets only.\n";
- return;
+template <class ELFT>
+const typename MipsGOTParser<ELFT>::Elf_Sym *
+MipsGOTParser<ELFT>::getPltSym(const Entry *E) const {
+ int64_t Offset = std::distance(getPltEntries().data(), E);
+ if (PltRelSec->sh_type == ELF::SHT_REL) {
+ Elf_Rel_Range Rels = unwrapOrError(Obj->rels(PltRelSec));
+ return unwrapOrError(Obj->getRelocationSymbol(&Rels[Offset], PltSymTable));
+ } else {
+ Elf_Rela_Range Rels = unwrapOrError(Obj->relas(PltRelSec));
+ return unwrapOrError(Obj->getRelocationSymbol(&Rels[Offset], PltSymTable));
}
+}
- MipsGOTParser<ELFT> GOTParser(this, Obj, dynamic_table(), W);
- if (dynamic_table().empty())
- GOTParser.parseStaticGOT();
- else {
- GOTParser.parseGOT();
- GOTParser.parsePLT();
- }
+template <class ELFT> void ELFDumper<ELFT>::printMipsPLTGOT() {
+ if (Obj->getHeader()->e_machine != EM_MIPS)
+ reportError("MIPS PLT GOT is available for MIPS targets only");
+
+ MipsGOTParser<ELFT> Parser(Obj, dynamic_table(), dynamic_symbols());
+ if (Parser.hasGot())
+ ELFDumperStyle->printMipsGOT(Parser);
+ if (Parser.hasPlt())
+ ELFDumperStyle->printMipsPLT(Parser);
}
static const EnumEntry<unsigned> ElfMipsISAExtType[] = {
@@ -3597,6 +3558,119 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
}
}
+template <class ELFT>
+void GNUStyle<ELFT>::printMipsGOT(const MipsGOTParser<ELFT> &Parser) {
+ size_t Bias = ELFT::Is64Bits ? 8 : 0;
+ auto PrintEntry = [&](const Elf_Addr *E, StringRef Purpose) {
+ OS.PadToColumn(2);
+ OS << format_hex_no_prefix(Parser.getGotAddress(E), 8 + Bias);
+ OS.PadToColumn(11 + Bias);
+ OS << format_decimal(Parser.getGotOffset(E), 6) << "(gp)";
+ OS.PadToColumn(22 + Bias);
+ OS << format_hex_no_prefix(*E, 8 + Bias);
+ OS.PadToColumn(31 + 2 * Bias);
+ OS << Purpose << "\n";
+ };
+
+ OS << (Parser.IsStatic ? "Static GOT:\n" : "Primary GOT:\n");
+ OS << " Canonical gp value: "
+ << format_hex_no_prefix(Parser.getGp(), 8 + Bias) << "\n\n";
+
+ OS << " Reserved entries:\n";
+ OS << " Address Access Initial Purpose\n";
+ PrintEntry(Parser.getGotLazyResolver(), "Lazy resolver");
+ if (Parser.getGotModulePointer())
+ PrintEntry(Parser.getGotModulePointer(), "Module pointer (GNU extension)");
+
+ if (!Parser.getLocalEntries().empty()) {
+ OS << "\n";
+ OS << " Local entries:\n";
+ OS << " Address Access Initial\n";
+ for (auto &E : Parser.getLocalEntries())
+ PrintEntry(&E, "");
+ }
+
+ if (Parser.IsStatic)
+ return;
+
+ if (!Parser.getGlobalEntries().empty()) {
+ OS << "\n";
+ OS << " Global entries:\n";
+ OS << " Address Access Initial Sym.Val. Type Ndx Name\n";
+ for (auto &E : Parser.getGlobalEntries()) {
+ const Elf_Sym *Sym = Parser.getGotSym(&E);
+ std::string SymName = this->dumper()->getFullSymbolName(
+ Sym, this->dumper()->getDynamicStringTable(), false);
+
+ OS.PadToColumn(2);
+ OS << to_string(format_hex_no_prefix(Parser.getGotAddress(&E), 8 + Bias));
+ OS.PadToColumn(11 + Bias);
+ OS << to_string(format_decimal(Parser.getGotOffset(&E), 6)) + "(gp)";
+ OS.PadToColumn(22 + Bias);
+ OS << to_string(format_hex_no_prefix(E, 8 + Bias));
+ OS.PadToColumn(31 + 2 * Bias);
+ OS << to_string(format_hex_no_prefix(Sym->st_value, 8 + Bias));
+ OS.PadToColumn(40 + 3 * Bias);
+ OS << printEnum(Sym->getType(), makeArrayRef(ElfSymbolTypes));
+ OS.PadToColumn(48 + 3 * Bias);
+ OS << getSymbolSectionNdx(Parser.Obj, Sym,
+ this->dumper()->dynamic_symbols().begin());
+ OS.PadToColumn(52 + 3 * Bias);
+ OS << SymName << "\n";
+ }
+ }
+
+ if (!Parser.getOtherEntries().empty())
+ OS << "\n Number of TLS and multi-GOT entries "
+ << Parser.getOtherEntries().size() << "\n";
+}
+
+template <class ELFT>
+void GNUStyle<ELFT>::printMipsPLT(const MipsGOTParser<ELFT> &Parser) {
+ size_t Bias = ELFT::Is64Bits ? 8 : 0;
+ auto PrintEntry = [&](const Elf_Addr *E, StringRef Purpose) {
+ OS.PadToColumn(2);
+ OS << format_hex_no_prefix(Parser.getPltAddress(E), 8 + Bias);
+ OS.PadToColumn(11 + Bias);
+ OS << format_hex_no_prefix(*E, 8 + Bias);
+ OS.PadToColumn(20 + 2 * Bias);
+ OS << Purpose << "\n";
+ };
+
+ OS << "PLT GOT:\n\n";
+
+ OS << " Reserved entries:\n";
+ OS << " Address Initial Purpose\n";
+ PrintEntry(Parser.getPltLazyResolver(), "PLT lazy resolver");
+ if (Parser.getPltModulePointer())
+ PrintEntry(Parser.getPltModulePointer(), "Module pointer");
+
+ if (!Parser.getPltEntries().empty()) {
+ OS << "\n";
+ OS << " Entries:\n";
+ OS << " Address Initial Sym.Val. Type Ndx Name\n";
+ for (auto &E : Parser.getPltEntries()) {
+ const Elf_Sym *Sym = Parser.getPltSym(&E);
+ std::string SymName = this->dumper()->getFullSymbolName(
+ Sym, this->dumper()->getDynamicStringTable(), false);
+
+ OS.PadToColumn(2);
+ OS << to_string(format_hex_no_prefix(Parser.getPltAddress(&E), 8 + Bias));
+ OS.PadToColumn(11 + Bias);
+ OS << to_string(format_hex_no_prefix(E, 8 + Bias));
+ OS.PadToColumn(20 + 2 * Bias);
+ OS << to_string(format_hex_no_prefix(Sym->st_value, 8 + Bias));
+ OS.PadToColumn(29 + 3 * Bias);
+ OS << printEnum(Sym->getType(), makeArrayRef(ElfSymbolTypes));
+ OS.PadToColumn(37 + 3 * Bias);
+ OS << getSymbolSectionNdx(Parser.Obj, Sym,
+ this->dumper()->dynamic_symbols().begin());
+ OS.PadToColumn(41 + 3 * Bias);
+ OS << SymName << "\n";
+ }
+ }
+}
+
template <class ELFT> void LLVMStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
const Elf_Ehdr *e = Obj->getHeader();
{
@@ -3854,8 +3928,7 @@ void LLVMStyle<ELFT>::printSymbol(const ELFO *Obj, const Elf_Sym *Symbol,
bool IsDynamic) {
unsigned SectionIndex = 0;
StringRef SectionName;
- getSectionNameIndex(*Obj, Symbol, First, this->dumper()->getShndxTable(),
- SectionName, SectionIndex);
+ this->dumper()->getSectionNameIndex(Symbol, First, SectionName, SectionIndex);
std::string FullSymbolName =
this->dumper()->getFullSymbolName(Symbol, StrTable, IsDynamic);
unsigned char SymbolType = Symbol->getType();
@@ -3992,3 +4065,114 @@ template <class ELFT>
void LLVMStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
W.startLine() << "printNotes not implemented!\n";
}
+
+template <class ELFT>
+void LLVMStyle<ELFT>::printMipsGOT(const MipsGOTParser<ELFT> &Parser) {
+ auto PrintEntry = [&](const Elf_Addr *E) {
+ W.printHex("Address", Parser.getGotAddress(E));
+ W.printNumber("Access", Parser.getGotOffset(E));
+ W.printHex("Initial", *E);
+ };
+
+ DictScope GS(W, Parser.IsStatic ? "Static GOT" : "Primary GOT");
+
+ W.printHex("Canonical gp value", Parser.getGp());
+ {
+ ListScope RS(W, "Reserved entries");
+ {
+ DictScope D(W, "Entry");
+ PrintEntry(Parser.getGotLazyResolver());
+ W.printString("Purpose", StringRef("Lazy resolver"));
+ }
+
+ if (Parser.getGotModulePointer()) {
+ DictScope D(W, "Entry");
+ PrintEntry(Parser.getGotModulePointer());
+ W.printString("Purpose", StringRef("Module pointer (GNU extension)"));
+ }
+ }
+ {
+ ListScope LS(W, "Local entries");
+ for (auto &E : Parser.getLocalEntries()) {
+ DictScope D(W, "Entry");
+ PrintEntry(&E);
+ }
+ }
+
+ if (Parser.IsStatic)
+ return;
+
+ {
+ ListScope GS(W, "Global entries");
+ for (auto &E : Parser.getGlobalEntries()) {
+ DictScope D(W, "Entry");
+
+ PrintEntry(&E);
+
+ const Elf_Sym *Sym = Parser.getGotSym(&E);
+ W.printHex("Value", Sym->st_value);
+ W.printEnum("Type", Sym->getType(), makeArrayRef(ElfSymbolTypes));
+
+ unsigned SectionIndex = 0;
+ StringRef SectionName;
+ this->dumper()->getSectionNameIndex(
+ Sym, this->dumper()->dynamic_symbols().begin(), SectionName,
+ SectionIndex);
+ W.printHex("Section", SectionName, SectionIndex);
+
+ std::string SymName = this->dumper()->getFullSymbolName(
+ Sym, this->dumper()->getDynamicStringTable(), true);
+ W.printNumber("Name", SymName, Sym->st_name);
+ }
+ }
+
+ W.printNumber("Number of TLS and multi-GOT entries",
+ uint64_t(Parser.getOtherEntries().size()));
+}
+
+template <class ELFT>
+void LLVMStyle<ELFT>::printMipsPLT(const MipsGOTParser<ELFT> &Parser) {
+ auto PrintEntry = [&](const Elf_Addr *E) {
+ W.printHex("Address", Parser.getPltAddress(E));
+ W.printHex("Initial", *E);
+ };
+
+ DictScope GS(W, "PLT GOT");
+
+ {
+ ListScope RS(W, "Reserved entries");
+ {
+ DictScope D(W, "Entry");
+ PrintEntry(Parser.getPltLazyResolver());
+ W.printString("Purpose", StringRef("PLT lazy resolver"));
+ }
+
+ if (auto E = Parser.getPltModulePointer()) {
+ DictScope D(W, "Entry");
+ PrintEntry(E);
+ W.printString("Purpose", StringRef("Module pointer"));
+ }
+ }
+ {
+ ListScope LS(W, "Entries");
+ for (auto &E : Parser.getPltEntries()) {
+ DictScope D(W, "Entry");
+ PrintEntry(&E);
+
+ const Elf_Sym *Sym = Parser.getPltSym(&E);
+ W.printHex("Value", Sym->st_value);
+ W.printEnum("Type", Sym->getType(), makeArrayRef(ElfSymbolTypes));
+
+ unsigned SectionIndex = 0;
+ StringRef SectionName;
+ this->dumper()->getSectionNameIndex(
+ Sym, this->dumper()->dynamic_symbols().begin(), SectionName,
+ SectionIndex);
+ W.printHex("Section", SectionName, SectionIndex);
+
+ std::string SymName =
+ this->dumper()->getFullSymbolName(Sym, Parser.getPltStrTable(), true);
+ W.printNumber("Name", SymName, Sym->st_name);
+ }
+ }
+}
diff --git a/tools/llvm-readobj/WasmDumper.cpp b/tools/llvm-readobj/WasmDumper.cpp
index 77711e749aa0..223c1c752469 100644
--- a/tools/llvm-readobj/WasmDumper.cpp
+++ b/tools/llvm-readobj/WasmDumper.cpp
@@ -100,8 +100,8 @@ void WasmDumper::printRelocation(const SectionRef &Section,
W.printNumber("Addend", WasmReloc.Addend);
} else {
raw_ostream& OS = W.startLine();
- OS << W.hex(Reloc.getOffset())
- << " " << RelocTypeName << "[" << WasmReloc.Index << "]";
+ OS << W.hex(Reloc.getOffset()) << " " << RelocTypeName << "["
+ << WasmReloc.Index << "]";
if (HasAddend)
OS << " " << WasmReloc.Addend;
OS << "\n";
@@ -156,6 +156,12 @@ void WasmDumper::printSections() {
if (WasmSec.Name == "linking") {
const wasm::WasmLinkingData &LinkingData = Obj->linkingData();
W.printNumber("DataSize", LinkingData.DataSize);
+ if (!LinkingData.InitFunctions.empty()) {
+ ListScope Group(W, "InitFunctions");
+ for (const wasm::WasmInitFunc &F: LinkingData.InitFunctions)
+ W.startLine() << F.FunctionIndex << " (priority=" << F.Priority
+ << ")\n";
+ }
}
break;
case wasm::WASM_SEC_DATA: {
diff --git a/tools/opt-viewer/optrecord.py b/tools/opt-viewer/optrecord.py
index 54e791192531..f79e05124b29 100644
--- a/tools/opt-viewer/optrecord.py
+++ b/tools/opt-viewer/optrecord.py
@@ -155,7 +155,7 @@ class Remark(yaml.YAMLObject):
assert(len(mapping) == 1)
(key, value) = list(mapping.items())[0]
- if key == 'Caller' or key == 'Callee':
+ if key == 'Caller' or key == 'Callee' or key == 'DirectCallee':
value = cgi.escape(self.demangle(value))
if dl and key != 'Caller':
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 84fb6fad1566..8b88c123b197 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -849,6 +849,23 @@ TEST(APFloatTest, fromDecimalString) {
EXPECT_EQ(2.71828, convertToDoubleFromString("2.71828"));
}
+TEST(APFloatTest, fromToStringSpecials) {
+ auto expects = [] (const char *first, const char *second) {
+ std::string roundtrip = convertToString(convertToDoubleFromString(second), 0, 3);
+ EXPECT_STREQ(first, roundtrip.c_str());
+ };
+ expects("+Inf", "+Inf");
+ expects("+Inf", "INFINITY");
+ expects("+Inf", "inf");
+ expects("-Inf", "-Inf");
+ expects("-Inf", "-INFINITY");
+ expects("-Inf", "-inf");
+ expects("NaN", "NaN");
+ expects("NaN", "nan");
+ expects("NaN", "-NaN");
+ expects("NaN", "-nan");
+}
+
TEST(APFloatTest, fromHexadecimalString) {
EXPECT_EQ( 1.0, APFloat(APFloat::IEEEdouble(), "0x1p0").convertToDouble());
EXPECT_EQ(+1.0, APFloat(APFloat::IEEEdouble(), "+0x1p0").convertToDouble());
diff --git a/unittests/ADT/StringRefTest.cpp b/unittests/ADT/StringRefTest.cpp
index 0684afe678f6..0e0b5957f025 100644
--- a/unittests/ADT/StringRefTest.cpp
+++ b/unittests/ADT/StringRefTest.cpp
@@ -875,7 +875,12 @@ struct GetDoubleStrings {
{"0.0", false, false, 0.0},
{"-0.0", false, false, -0.0},
{"123.45", false, true, 123.45},
- {"123.45", true, false, 123.45}};
+ {"123.45", true, false, 123.45},
+ {"1.8e308", true, false, std::numeric_limits<double>::infinity()},
+ {"1.8e308", false, true, std::numeric_limits<double>::infinity()},
+ {"0x0.0000000000001P-1023", false, true, 0.0},
+ {"0x0.0000000000001P-1023", true, false, 0.0},
+ };
TEST(StringRefTest, getAsDouble) {
for (const auto &Entry : DoubleStrings) {
diff --git a/unittests/CodeGen/MachineOperandTest.cpp b/unittests/CodeGen/MachineOperandTest.cpp
index e51207b95716..78a20b836486 100644
--- a/unittests/CodeGen/MachineOperandTest.cpp
+++ b/unittests/CodeGen/MachineOperandTest.cpp
@@ -10,6 +10,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
@@ -336,4 +337,66 @@ TEST(MachineOperandTest, PrintMCSymbol) {
ASSERT_TRUE(OS.str() == "<mcsymbol foo>");
}
+TEST(MachineOperandTest, PrintCFI) {
+ // Create a MachineOperand with a CFI index but no function and print it.
+ MachineOperand MO = MachineOperand::CreateCFIIndex(8);
+
+ // Checking some preconditions on the newly created
+ // MachineOperand.
+ ASSERT_TRUE(MO.isCFIIndex());
+ ASSERT_TRUE(MO.getCFIIndex() == 8);
+
+ std::string str;
+ // Print a MachineOperand containing a CFI Index node but no machine function
+ // attached to it.
+ raw_string_ostream OS(str);
+ MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
+ ASSERT_TRUE(OS.str() == "<cfi directive>");
+}
+
+TEST(MachineOperandTest, PrintIntrinsicID) {
+ // Create a MachineOperand with a generic intrinsic ID.
+ MachineOperand MO = MachineOperand::CreateIntrinsicID(Intrinsic::bswap);
+
+ // Checking some preconditions on the newly created
+ // MachineOperand.
+ ASSERT_TRUE(MO.isIntrinsicID());
+ ASSERT_TRUE(MO.getIntrinsicID() == Intrinsic::bswap);
+
+ std::string str;
+ {
+ // Print a MachineOperand containing a generic intrinsic ID.
+ raw_string_ostream OS(str);
+ MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
+ ASSERT_TRUE(OS.str() == "intrinsic(@llvm.bswap)");
+ }
+
+ str.clear();
+ // Set a target-specific intrinsic.
+ MO = MachineOperand::CreateIntrinsicID((Intrinsic::ID)-1);
+ {
+ // Print a MachineOperand containing a target-specific intrinsic ID but not
+ // IntrinsicInfo.
+ raw_string_ostream OS(str);
+ MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
+ ASSERT_TRUE(OS.str() == "intrinsic(4294967295)");
+ }
+}
+
+TEST(MachineOperandTest, PrintPredicate) {
+ // Create a MachineOperand with an integer predicate (ICMP_EQ).
+ MachineOperand MO = MachineOperand::CreatePredicate(CmpInst::ICMP_EQ);
+
+ // Checking some preconditions on the newly created
+ // MachineOperand.
+ ASSERT_TRUE(MO.isPredicate());
+ ASSERT_TRUE(MO.getPredicate() == CmpInst::ICMP_EQ);
+
+ std::string str;
+ // Print a MachineOperand containing an int predicate ICMP_EQ.
+ raw_string_ostream OS(str);
+ MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
+ ASSERT_TRUE(OS.str() == "intpred(eq)");
+}
+
} // end namespace
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt
index 28e07959ac7b..dd0281d0b73d 100644
--- a/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -24,4 +24,9 @@ add_llvm_unittest(OrcJITTests
SymbolStringPoolTest.cpp
)
-target_link_libraries(OrcJITTests PRIVATE ${LLVM_PTHREAD_LIB})
+set(ORC_JIT_TEST_LIBS ${LLVM_PTHREAD_LIB})
+if(NOT HAVE_CXX_ATOMICS64_WITHOUT_LIB)
+ list(APPEND ORC_JIT_TEST_LIBS atomic)
+endif()
+
+target_link_libraries(OrcJITTests PRIVATE ${ORC_JIT_TEST_LIBS})
diff --git a/unittests/Support/CachePruningTest.cpp b/unittests/Support/CachePruningTest.cpp
index 1bb57871925c..4bc2ad19ba43 100644
--- a/unittests/Support/CachePruningTest.cpp
+++ b/unittests/Support/CachePruningTest.cpp
@@ -27,10 +27,10 @@ TEST(CachePruningPolicyParser, Interval) {
EXPECT_EQ(std::chrono::seconds(1), P->Interval);
P = parseCachePruningPolicy("prune_interval=2m");
ASSERT_TRUE(bool(P));
- EXPECT_EQ(std::chrono::minutes(2), P->Interval);
+ EXPECT_EQ(std::chrono::minutes(2), *P->Interval);
P = parseCachePruningPolicy("prune_interval=3h");
ASSERT_TRUE(bool(P));
- EXPECT_EQ(std::chrono::hours(3), P->Interval);
+ EXPECT_EQ(std::chrono::hours(3), *P->Interval);
}
TEST(CachePruningPolicyParser, Expiration) {
diff --git a/unittests/Support/MemoryBufferTest.cpp b/unittests/Support/MemoryBufferTest.cpp
index 294581aeb928..5e3c8db02791 100644
--- a/unittests/Support/MemoryBufferTest.cpp
+++ b/unittests/Support/MemoryBufferTest.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
using namespace llvm;
@@ -103,15 +104,15 @@ TEST_F(MemoryBufferTest, copy) {
TEST_F(MemoryBufferTest, make_new) {
// 0-sized buffer
- OwningBuffer Zero(MemoryBuffer::getNewUninitMemBuffer(0));
+ OwningBuffer Zero(WritableMemoryBuffer::getNewUninitMemBuffer(0));
EXPECT_TRUE(nullptr != Zero.get());
// uninitialized buffer with no name
- OwningBuffer One(MemoryBuffer::getNewUninitMemBuffer(321));
+ OwningBuffer One(WritableMemoryBuffer::getNewUninitMemBuffer(321));
EXPECT_TRUE(nullptr != One.get());
// uninitialized buffer with name
- OwningBuffer Two(MemoryBuffer::getNewUninitMemBuffer(123, "bla"));
+ OwningBuffer Two(WritableMemoryBuffer::getNewUninitMemBuffer(123, "bla"));
EXPECT_TRUE(nullptr != Two.get());
// 0-initialized buffer with no name
@@ -226,4 +227,37 @@ TEST_F(MemoryBufferTest, slice) {
EXPECT_TRUE(BufData2.substr(0x1800,8).equals("abcdefgh"));
EXPECT_TRUE(BufData2.substr(0x2FF8,8).equals("abcdefgh"));
}
+
+TEST_F(MemoryBufferTest, writableSlice) {
+ // Create a file initialized with some data
+ int FD;
+ SmallString<64> TestPath;
+ sys::fs::createTemporaryFile("MemoryBufferTest_WritableSlice", "temp", FD,
+ TestPath);
+ FileRemover Cleanup(TestPath);
+ raw_fd_ostream OF(FD, true);
+ for (unsigned i = 0; i < 0x1000; ++i)
+ OF << "0123456789abcdef";
+ OF.close();
+
+ {
+ auto MBOrError =
+ WritableMemoryBuffer::getFileSlice(TestPath.str(), 0x6000, 0x2000);
+ ASSERT_FALSE(MBOrError.getError());
+ // Write some data. It should be mapped private, so that upon completion
+ // the original file contents are not modified.
+ WritableMemoryBuffer &MB = **MBOrError;
+ ASSERT_EQ(0x6000u, MB.getBufferSize());
+ char *Start = MB.getBufferStart();
+ ASSERT_EQ(MB.getBufferEnd(), MB.getBufferStart() + MB.getBufferSize());
+ ::memset(Start, 'x', MB.getBufferSize());
+ }
+
+ auto MBOrError = MemoryBuffer::getFile(TestPath);
+ ASSERT_FALSE(MBOrError.getError());
+ auto &MB = **MBOrError;
+ ASSERT_EQ(0x10000u, MB.getBufferSize());
+ for (size_t i = 0; i < MB.getBufferSize(); i += 0x10)
+ EXPECT_EQ("0123456789abcdef", MB.getBuffer().substr(i, 0x10)) << "i: " << i;
+}
}
diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp
index dcef40345f05..48fffca1aa18 100644
--- a/unittests/Support/TargetParserTest.cpp
+++ b/unittests/Support/TargetParserTest.cpp
@@ -278,6 +278,12 @@ TEST(TargetParserTest, testARMCPU) {
"7-S"));
}
+TEST(TargetParserTest, testInvalidARMArch) {
+ auto InvalidArchStrings = {"armv", "armv99", "noarm"};
+ for (const char* InvalidArch : InvalidArchStrings)
+ EXPECT_EQ(ARM::parseArch(InvalidArch), ARM::ArchKind::INVALID);
+}
+
bool testARMArch(StringRef Arch, StringRef DefaultCPU, StringRef SubArch,
unsigned ArchAttr) {
ARM::ArchKind AK = ARM::parseArch(Arch);
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 9caff85a5963..914b22f0fcdf 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
@@ -2450,94 +2451,34 @@ TEST(YAMLIO, TestCustomMappingStruct) {
EXPECT_EQ(4, y["bar"].bar);
}
-TEST(YAMLIO, InvalidInput) {
- // polluting 1 value in the sequence
- Input yin("---\n- foo: 3\n bar: 5\n1\n- foo: 3\n bar: 5\n...\n");
- std::vector<FooBar> Data;
- yin >> Data;
- EXPECT_TRUE((bool)yin.error());
-}
-
-TEST(YAMLIO, TestEscapedSingleQuote) {
- std::string Id = "@abc@";
-
- std::string out;
- llvm::raw_string_ostream ostr(out);
- Output xout(ostr, nullptr, 0);
-
- llvm::yaml::EmptyContext Ctx;
- yamlize(xout, Id, true, Ctx);
-
- ostr.flush();
- EXPECT_EQ("'@abc@'", out);
-}
-
-TEST(YAMLIO, TestEscapedNoQuote) {
- std::string Id = "abc/";
-
- std::string out;
- llvm::raw_string_ostream ostr(out);
- Output xout(ostr, nullptr, 0);
-
- llvm::yaml::EmptyContext Ctx;
- yamlize(xout, Id, true, Ctx);
-
- ostr.flush();
- EXPECT_EQ("abc/", out);
-}
-
-TEST(YAMLIO, TestEscapedDoubleQuoteNonPrintable) {
- std::string Id = "\01@abc@";
-
- std::string out;
- llvm::raw_string_ostream ostr(out);
- Output xout(ostr, nullptr, 0);
-
- llvm::yaml::EmptyContext Ctx;
- yamlize(xout, Id, true, Ctx);
-
- ostr.flush();
- EXPECT_EQ("\"\\x01@abc@\"", out);
-}
-
-TEST(YAMLIO, TestEscapedDoubleQuoteInsideSingleQuote) {
- std::string Id = "abc\"fdf";
-
- std::string out;
- llvm::raw_string_ostream ostr(out);
- Output xout(ostr, nullptr, 0);
-
- llvm::yaml::EmptyContext Ctx;
- yamlize(xout, Id, true, Ctx);
-
- ostr.flush();
- EXPECT_EQ("'abc\"fdf'", out);
-}
-
-TEST(YAMLIO, TestEscapedDoubleQuoteInsideDoubleQuote) {
- std::string Id = "\01bc\"fdf";
-
- std::string out;
- llvm::raw_string_ostream ostr(out);
- Output xout(ostr, nullptr, 0);
-
- llvm::yaml::EmptyContext Ctx;
- yamlize(xout, Id, true, Ctx);
-
- ostr.flush();
- EXPECT_EQ("\"\\x01bc\\\"fdf\"", out);
-}
-
-TEST(YAMLIO, TestEscapedSingleQuoteInsideSingleQuote) {
- std::string Id = "abc'fdf";
-
+static void TestEscaped(llvm::StringRef Input, llvm::StringRef Expected) {
std::string out;
llvm::raw_string_ostream ostr(out);
Output xout(ostr, nullptr, 0);
llvm::yaml::EmptyContext Ctx;
- yamlize(xout, Id, true, Ctx);
+ yamlize(xout, Input, true, Ctx);
ostr.flush();
- EXPECT_EQ("'abc''fdf'", out);
+ EXPECT_EQ(Expected, out);
+}
+
+TEST(YAMLIO, TestEscaped) {
+ // Single quote
+ TestEscaped("@abc@", "'@abc@'");
+ // No quote
+ TestEscaped("abc/", "abc/");
+ // Double quote non-printable
+ TestEscaped("\01@abc@", "\"\\x01@abc@\"");
+ // Double quote inside single quote
+ TestEscaped("abc\"fdf", "'abc\"fdf'");
+ // Double quote inside double quote
+ TestEscaped("\01bc\"fdf", "\"\\x01bc\\\"fdf\"");
+ // Single quote inside single quote
+ TestEscaped("abc'fdf", "'abc''fdf'");
+ // UTF8
+ TestEscaped("/*параметр*/", "\"/*параметр*/\"");
+ // UTF8 with single quote inside double quote
+ TestEscaped("parameter 'параметр' is unused",
+ "\"parameter 'параметр' is unused\"");
}
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index c84f4925aa78..0944d54a4273 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -32,6 +32,7 @@ add_tablegen(llvm-tblgen LLVM
PseudoLoweringEmitter.cpp
RegisterBankEmitter.cpp
RegisterInfoEmitter.cpp
+ SDNodeProperties.cpp
SearchableTableEmitter.cpp
SubtargetEmitter.cpp
SubtargetFeatureInfo.cpp
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 51473f06da79..7755cd1be355 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -309,23 +309,23 @@ bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) {
return Changed;
}
-void TypeSetByHwMode::validate() const {
+bool TypeSetByHwMode::validate() const {
#ifndef NDEBUG
if (empty())
- return;
+ return true;
bool AllEmpty = true;
for (const auto &I : *this)
AllEmpty &= I.second.empty();
- assert(!AllEmpty &&
- "type set is empty for each HW mode: type contradiction?");
+ return !AllEmpty;
#endif
+ return true;
}
// --- TypeInfer
bool TypeInfer::MergeInTypeInfo(TypeSetByHwMode &Out,
const TypeSetByHwMode &In) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
In.validate();
if (In.empty() || Out == In || TP.hasError())
return false;
@@ -342,7 +342,7 @@ bool TypeInfer::MergeInTypeInfo(TypeSetByHwMode &Out,
}
bool TypeInfer::forceArbitrary(TypeSetByHwMode &Out) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
if (TP.hasError())
return false;
assert(!Out.empty() && "cannot pick from an empty set");
@@ -361,7 +361,7 @@ bool TypeInfer::forceArbitrary(TypeSetByHwMode &Out) {
}
bool TypeInfer::EnforceInteger(TypeSetByHwMode &Out) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
if (TP.hasError())
return false;
if (!Out.empty())
@@ -371,7 +371,7 @@ bool TypeInfer::EnforceInteger(TypeSetByHwMode &Out) {
}
bool TypeInfer::EnforceFloatingPoint(TypeSetByHwMode &Out) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
if (TP.hasError())
return false;
if (!Out.empty())
@@ -381,7 +381,7 @@ bool TypeInfer::EnforceFloatingPoint(TypeSetByHwMode &Out) {
}
bool TypeInfer::EnforceScalar(TypeSetByHwMode &Out) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
if (TP.hasError())
return false;
if (!Out.empty())
@@ -391,7 +391,7 @@ bool TypeInfer::EnforceScalar(TypeSetByHwMode &Out) {
}
bool TypeInfer::EnforceVector(TypeSetByHwMode &Out) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
if (TP.hasError())
return false;
if (!Out.empty())
@@ -401,7 +401,7 @@ bool TypeInfer::EnforceVector(TypeSetByHwMode &Out) {
}
bool TypeInfer::EnforceAny(TypeSetByHwMode &Out) {
- ValidateOnExit _1(Out);
+ ValidateOnExit _1(Out, *this);
if (TP.hasError() || !Out.empty())
return false;
@@ -440,7 +440,7 @@ static Iter max_if(Iter B, Iter E, Pred P, Less L) {
/// Make sure that for each type in Small, there exists a larger type in Big.
bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small,
TypeSetByHwMode &Big) {
- ValidateOnExit _1(Small), _2(Big);
+ ValidateOnExit _1(Small, *this), _2(Big, *this);
if (TP.hasError())
return false;
bool Changed = false;
@@ -545,7 +545,7 @@ bool TypeInfer::EnforceSmallerThan(TypeSetByHwMode &Small,
/// type T in Vec, such that U is the element type of T.
bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec,
TypeSetByHwMode &Elem) {
- ValidateOnExit _1(Vec), _2(Elem);
+ ValidateOnExit _1(Vec, *this), _2(Elem, *this);
if (TP.hasError())
return false;
bool Changed = false;
@@ -586,7 +586,7 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec,
bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec,
const ValueTypeByHwMode &VVT) {
TypeSetByHwMode Tmp(VVT);
- ValidateOnExit _1(Vec), _2(Tmp);
+ ValidateOnExit _1(Vec, *this), _2(Tmp, *this);
return EnforceVectorEltTypeIs(Vec, Tmp);
}
@@ -595,7 +595,7 @@ bool TypeInfer::EnforceVectorEltTypeIs(TypeSetByHwMode &Vec,
/// element type as T and at least as many elements as T.
bool TypeInfer::EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec,
TypeSetByHwMode &Sub) {
- ValidateOnExit _1(Vec), _2(Sub);
+ ValidateOnExit _1(Vec, *this), _2(Sub, *this);
if (TP.hasError())
return false;
@@ -661,7 +661,7 @@ bool TypeInfer::EnforceVectorSubVectorTypeIs(TypeSetByHwMode &Vec,
/// type T in V, such that T and U have the same number of elements
/// (reverse of 2).
bool TypeInfer::EnforceSameNumElts(TypeSetByHwMode &V, TypeSetByHwMode &W) {
- ValidateOnExit _1(V), _2(W);
+ ValidateOnExit _1(V, *this), _2(W, *this);
if (TP.hasError())
return false;
@@ -699,7 +699,7 @@ bool TypeInfer::EnforceSameNumElts(TypeSetByHwMode &V, TypeSetByHwMode &W) {
/// 2. Ensure that for each type U in B, there exists a type T in A
/// such that T and U have equal size in bits (reverse of 1).
bool TypeInfer::EnforceSameSize(TypeSetByHwMode &A, TypeSetByHwMode &B) {
- ValidateOnExit _1(A), _2(B);
+ ValidateOnExit _1(A, *this), _2(B, *this);
if (TP.hasError())
return false;
bool Changed = false;
@@ -730,7 +730,7 @@ bool TypeInfer::EnforceSameSize(TypeSetByHwMode &A, TypeSetByHwMode &B) {
}
void TypeInfer::expandOverloads(TypeSetByHwMode &VTS) {
- ValidateOnExit _1(VTS);
+ ValidateOnExit _1(VTS, *this);
TypeSetByHwMode Legal = getLegalTypes();
bool HaveLegalDef = Legal.hasDefault();
@@ -806,6 +806,19 @@ TypeSetByHwMode TypeInfer::getLegalTypes() {
return VTS;
}
+#ifndef NDEBUG
+TypeInfer::ValidateOnExit::~ValidateOnExit() {
+ if (!VTS.validate()) {
+ dbgs() << "Type set is empty for each HW mode:\n"
+ "possible type contradiction in the pattern below "
+ "(use -print-records with llvm-tblgen to see all "
+ "expanded records).\n";
+ Infer.TP.dump();
+ llvm_unreachable(nullptr);
+ }
+}
+#endif
+
//===----------------------------------------------------------------------===//
// TreePredicateFn Implementation
//===----------------------------------------------------------------------===//
@@ -1591,37 +1604,7 @@ SDNodeInfo::SDNodeInfo(Record *R, const CodeGenHwModes &CGH) : Def(R) {
NumOperands = TypeProfile->getValueAsInt("NumOperands");
// Parse the properties.
- Properties = 0;
- for (Record *Property : R->getValueAsListOfDefs("Properties")) {
- if (Property->getName() == "SDNPCommutative") {
- Properties |= 1 << SDNPCommutative;
- } else if (Property->getName() == "SDNPAssociative") {
- Properties |= 1 << SDNPAssociative;
- } else if (Property->getName() == "SDNPHasChain") {
- Properties |= 1 << SDNPHasChain;
- } else if (Property->getName() == "SDNPOutGlue") {
- Properties |= 1 << SDNPOutGlue;
- } else if (Property->getName() == "SDNPInGlue") {
- Properties |= 1 << SDNPInGlue;
- } else if (Property->getName() == "SDNPOptInGlue") {
- Properties |= 1 << SDNPOptInGlue;
- } else if (Property->getName() == "SDNPMayStore") {
- Properties |= 1 << SDNPMayStore;
- } else if (Property->getName() == "SDNPMayLoad") {
- Properties |= 1 << SDNPMayLoad;
- } else if (Property->getName() == "SDNPSideEffect") {
- Properties |= 1 << SDNPSideEffect;
- } else if (Property->getName() == "SDNPMemOperand") {
- Properties |= 1 << SDNPMemOperand;
- } else if (Property->getName() == "SDNPVariadic") {
- Properties |= 1 << SDNPVariadic;
- } else {
- PrintFatalError("Unknown SD Node property '" +
- Property->getName() + "' on node '" +
- R->getName() + "'!");
- }
- }
-
+ Properties = parseSDPatternOperatorProperties(R);
// Parse the type constraints.
std::vector<Record*> ConstraintList =
@@ -2100,11 +2083,20 @@ bool TreePatternNode::NodeHasProperty(SDNP Property,
if (isLeaf()) {
if (const ComplexPattern *CP = getComplexPatternInfo(CGP))
return CP->hasProperty(Property);
+
return false;
}
- Record *Operator = getOperator();
- if (!Operator->isSubClassOf("SDNode")) return false;
+ if (Property != SDNPHasChain) {
+ // The chain property is already present on the different intrinsic node
+ // types (intrinsic_w_chain, intrinsic_void), and is not explicitly listed
+ // on the intrinsic. Anything else is specific to the individual intrinsic.
+ if (const CodeGenIntrinsic *Int = getIntrinsicInfo(CGP))
+ return Int->hasProperty(Property);
+ }
+
+ Record *Operator = getOperator();
+ if (!Operator->isSubClassOf("SDPatternOperator"))
+ return false;
return CGP.getSDNodeInfo(Operator).hasProperty(Property);
}
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index afbcb10a4b66..8a8132c7f894 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -18,6 +18,7 @@
#include "CodeGenHwModes.h"
#include "CodeGenIntrinsics.h"
#include "CodeGenTarget.h"
+#include "SDNodeProperties.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"
@@ -234,7 +235,7 @@ struct TypeSetByHwMode : public InfoByHwMode<MachineValueTypeSet> {
bool operator!=(const TypeSetByHwMode &VTS) const { return !(*this == VTS); }
void dump() const;
- void validate() const;
+ bool validate() const;
private:
/// Intersect two sets. Return true if anything has changed.
@@ -319,8 +320,13 @@ struct TypeInfer {
const TypeSetByHwMode::SetType &Legal);
struct ValidateOnExit {
- ValidateOnExit(TypeSetByHwMode &T) : VTS(T) {}
- ~ValidateOnExit() { VTS.validate(); }
+ ValidateOnExit(TypeSetByHwMode &T, TypeInfer &TI) : Infer(TI), VTS(T) {}
+ #ifndef NDEBUG
+ ~ValidateOnExit();
+ #else
+ ~ValidateOnExit() {} // Empty destructor with NDEBUG.
+ #endif
+ TypeInfer &Infer;
TypeSetByHwMode &VTS;
};
@@ -1204,6 +1210,7 @@ inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode *N,
MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP);
return MadeChange;
}
+
} // end namespace llvm
#endif
diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index 24374127f536..91305034dc24 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@@ -14,6 +14,7 @@
#ifndef LLVM_UTILS_TABLEGEN_CODEGENINTRINSICS_H
#define LLVM_UTILS_TABLEGEN_CODEGENINTRINSICS_H
+#include "SDNodeProperties.h"
#include "llvm/CodeGen/MachineValueType.h"
#include <string>
#include <vector>
@@ -104,6 +105,9 @@ struct CodeGenIntrinsic {
};
ModRefBehavior ModRef;
+ /// SDPatternOperator Properties applied to the intrinsic.
+ unsigned Properties;
+
/// This is set to true if the intrinsic is overloaded by its argument
/// types.
bool isOverloaded;
@@ -133,6 +137,10 @@ struct CodeGenIntrinsic {
enum ArgAttribute { NoCapture, Returned, ReadOnly, WriteOnly, ReadNone };
std::vector<std::pair<unsigned, ArgAttribute>> ArgumentAttributes;
+ bool hasProperty(enum SDNP Prop) const {
+ return Properties & (1 << Prop);
+ }
+
CodeGenIntrinsic(Record *R);
};
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 827b6083c17f..168bd690831f 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "CodeGenTarget.h"
+#include "CodeGenDAGPatterns.h"
#include "CodeGenIntrinsics.h"
#include "CodeGenSchedule.h"
#include "llvm/ADT/STLExtras.h"
@@ -450,6 +451,7 @@ ComplexPattern::ComplexPattern(Record *R) {
else
Complexity = RawComplexity;
+ // FIXME: Why is this different from parseSDPatternOperatorProperties?
// Parse the properties.
Properties = 0;
std::vector<Record*> PropList = R->getValueAsListOfDefs("Properties");
@@ -512,6 +514,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
TheDef = R;
std::string DefName = R->getName();
ModRef = ReadWriteMem;
+ Properties = 0;
isOverloaded = false;
isCommutative = false;
canThrow = false;
@@ -681,6 +684,10 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
llvm_unreachable("Unknown property!");
}
+ // Also record the SDPatternOperator Properties.
+ Properties = parseSDPatternOperatorProperties(R);
+
// Sort the argument attributes for later benefit.
std::sort(ArgumentAttributes.begin(), ArgumentAttributes.end());
}
+
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 89aa81b5fc33..7280d707fba6 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -21,6 +21,7 @@
#include "CodeGenInstruction.h"
#include "CodeGenRegisters.h"
#include "InfoByHwMode.h"
+#include "SDNodeProperties.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include <algorithm>
@@ -31,25 +32,6 @@ struct CodeGenRegister;
class CodeGenSchedModels;
class CodeGenTarget;
-// SelectionDAG node properties.
-// SDNPMemOperand: indicates that a node touches memory and therefore must
-// have an associated memory operand that describes the access.
-enum SDNP {
- SDNPCommutative,
- SDNPAssociative,
- SDNPHasChain,
- SDNPOutGlue,
- SDNPInGlue,
- SDNPOptInGlue,
- SDNPMayLoad,
- SDNPMayStore,
- SDNPSideEffect,
- SDNPMemOperand,
- SDNPVariadic,
- SDNPWantRoot,
- SDNPWantParent
-};
-
/// getValueType - Return the MVT::SimpleValueType that the specified TableGen
/// record corresponds to.
MVT::SimpleValueType getValueType(Record *Rec);
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index b80f02355062..c7d662db5a2f 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -574,6 +574,7 @@ class Matcher {
public:
virtual ~Matcher() = default;
virtual void emit(MatchTable &Table) = 0;
+ virtual std::unique_ptr<PredicateMatcher> forgetFirstCondition() = 0;
};
class GroupMatcher : public Matcher {
@@ -595,6 +596,15 @@ public:
Rules.clear();
}
void emit(MatchTable &Table) override;
+
+ std::unique_ptr<PredicateMatcher> forgetFirstCondition() override {
+ // We shouldn't need to mess with groups, since we
+ // should have merged everything shareable upfront.
+ // If we start to look into reordering predicates,
+ // we may want to reconsider this.
+ assert(0 && "Groups should be formed maximal for now");
+ llvm_unreachable("No need for this for now");
+ }
};
/// Generates code to check that a match rule matches.
@@ -749,7 +759,7 @@ public:
/// matcher.
unsigned countRendererFns() const;
- std::unique_ptr<PredicateMatcher> forgetFirstCondition();
+ std::unique_ptr<PredicateMatcher> forgetFirstCondition() override;
// FIXME: Remove this as soon as possible
InstructionMatcher &insnmatchers_front() const { return *Matchers.front(); }
@@ -921,7 +931,7 @@ class SameOperandMatcher : public OperandPredicateMatcher {
std::string MatchingName;
public:
- SameOperandMatcher(StringRef MatchingName, unsigned InsnVarID, unsigned OpIdx)
+ SameOperandMatcher(unsigned InsnVarID, unsigned OpIdx, StringRef MatchingName)
: OperandPredicateMatcher(OPM_SameOperand, InsnVarID, OpIdx),
MatchingName(MatchingName) {}
@@ -941,7 +951,7 @@ protected:
public:
static std::set<LLTCodeGen> KnownTypes;
- LLTOperandMatcher(const LLTCodeGen &Ty, unsigned InsnVarID, unsigned OpIdx)
+ LLTOperandMatcher(unsigned InsnVarID, unsigned OpIdx, const LLTCodeGen &Ty)
: OperandPredicateMatcher(OPM_LLT, InsnVarID, OpIdx), Ty(Ty) {
KnownTypes.insert(Ty);
}
@@ -981,8 +991,8 @@ protected:
unsigned SizeInBits;
public:
- PointerToAnyOperandMatcher(unsigned SizeInBits, unsigned InsnVarID,
- unsigned OpIdx)
+ PointerToAnyOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
+ unsigned SizeInBits)
: OperandPredicateMatcher(OPM_PointerToAny, InsnVarID, OpIdx),
SizeInBits(SizeInBits) {}
@@ -1011,9 +1021,9 @@ protected:
public:
bool isIdentical(const PredicateMatcher &B) const override { return false; }
- ComplexPatternOperandMatcher(const OperandMatcher &Operand,
- const Record &TheDef, unsigned InsnVarID,
- unsigned OpIdx)
+ ComplexPatternOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
+ const OperandMatcher &Operand,
+ const Record &TheDef)
: OperandPredicateMatcher(OPM_ComplexPattern, InsnVarID, OpIdx),
Operand(Operand), TheDef(TheDef) {}
@@ -1043,8 +1053,8 @@ protected:
const CodeGenRegisterClass &RC;
public:
- RegisterBankOperandMatcher(const CodeGenRegisterClass &RC, unsigned InsnVarID,
- unsigned OpIdx)
+ RegisterBankOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
+ const CodeGenRegisterClass &RC)
: OperandPredicateMatcher(OPM_RegBank, InsnVarID, OpIdx), RC(RC) {}
bool isIdentical(const PredicateMatcher &B) const override {
@@ -1092,7 +1102,7 @@ protected:
int64_t Value;
public:
- ConstantIntOperandMatcher(int64_t Value, unsigned InsnVarID, unsigned OpIdx)
+ ConstantIntOperandMatcher(unsigned InsnVarID, unsigned OpIdx, int64_t Value)
: OperandPredicateMatcher(OPM_Int, InsnVarID, OpIdx), Value(Value) {}
bool isIdentical(const PredicateMatcher &B) const override {
@@ -1120,7 +1130,7 @@ protected:
int64_t Value;
public:
- LiteralIntOperandMatcher(int64_t Value, unsigned InsnVarID, unsigned OpIdx)
+ LiteralIntOperandMatcher(unsigned InsnVarID, unsigned OpIdx, int64_t Value)
: OperandPredicateMatcher(OPM_LiteralInt, InsnVarID, OpIdx),
Value(Value) {}
@@ -1148,8 +1158,8 @@ protected:
const CodeGenIntrinsic *II;
public:
- IntrinsicIDOperandMatcher(const CodeGenIntrinsic *II, unsigned InsnVarID,
- unsigned OpIdx)
+ IntrinsicIDOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
+ const CodeGenIntrinsic *II)
: OperandPredicateMatcher(OPM_IntrinsicID, InsnVarID, OpIdx), II(II) {}
bool isIdentical(const PredicateMatcher &B) const override {
@@ -1284,9 +1294,9 @@ PredicateListMatcher<OperandPredicateMatcher>::addPredicate(Args &&... args) {
auto *OpMatcher = static_cast<OperandMatcher *>(this);
if (static_cast<OperandMatcher *>(this)->isSameAsAnotherOperand())
return None;
- Predicates.emplace_back(llvm::make_unique<Kind>(
- std::forward<Args>(args)..., OpMatcher->getInsnVarID(),
- OpMatcher->getOperandIndex()));
+ Predicates.emplace_back(llvm::make_unique<Kind>(OpMatcher->getInsnVarID(),
+ OpMatcher->getOperandIndex(),
+ std::forward<Args>(args)...));
return static_cast<Kind *>(Predicates.back().get());
}
@@ -1658,8 +1668,8 @@ protected:
std::unique_ptr<InstructionMatcher> InsnMatcher;
public:
- InstructionOperandMatcher(RuleMatcher &Rule, StringRef SymbolicName,
- unsigned InsnVarID, unsigned OpIdx)
+ InstructionOperandMatcher(unsigned InsnVarID, unsigned OpIdx,
+ RuleMatcher &Rule, StringRef SymbolicName)
: OperandPredicateMatcher(OPM_Instruction, InsnVarID, OpIdx),
InsnMatcher(new InstructionMatcher(Rule, SymbolicName)) {}
@@ -2336,6 +2346,10 @@ void RuleMatcher::emit(MatchTable &Table) {
if (Matchers.empty())
llvm_unreachable("Unexpected empty matcher!");
+ // Reset the ID generation so that the emitted IDs match the ones
+ // we set while building the InstructionMatcher and such.
+ clearImplicitMap();
+
// The representation supports rules that require multiple roots such as:
// %ptr(p0) = ...
// %elt0(s32) = G_LOAD %ptr
@@ -2615,7 +2629,7 @@ private:
/// # predicate C
/// \endverbatim
std::vector<Matcher *> optimizeRules(
- std::vector<RuleMatcher> &Rules,
+ const std::vector<Matcher *> &Rules,
std::vector<std::unique_ptr<GroupMatcher>> &StorageGroupMatcher);
};
@@ -3342,9 +3356,6 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
unsigned TempOpIdx = 0;
auto InsnMatcherOrError =
createAndImportSelDAGMatcher(M, InsnMatcherTemp, Src, TempOpIdx);
- // Reset the ID generation so that the emitted IDs match the ones
- // in the InstructionMatcher and such.
- M.clearImplicitMap();
if (auto Error = InsnMatcherOrError.takeError())
return std::move(Error);
InstructionMatcher &InsnMatcher = InsnMatcherOrError.get();
@@ -3539,28 +3550,36 @@ void GlobalISelEmitter::emitImmPredicates(
OS << "};\n";
}
- for (const auto *Record : MatchedRecords)
- OS << "static bool Predicate_" << Record->getName() << "(" << Type
- << " Imm) {" << Record->getValueAsString("ImmediateCode") << "}\n";
-
- OS << "static InstructionSelector::" << TypeIdentifier
- << "ImmediatePredicateFn " << TypeIdentifier << "ImmPredicateFns[] = {\n"
- << " nullptr,\n";
- for (const auto *Record : MatchedRecords)
- OS << " Predicate_" << Record->getName() << ",\n";
- OS << "};\n";
+ OS << "bool " << Target.getName() << "InstructionSelector::testImmPredicate_"
+ << TypeIdentifier << "(unsigned PredicateID, " << Type
+ << " Imm) const {\n";
+ if (!MatchedRecords.empty())
+ OS << " switch (PredicateID) {\n";
+ for (const auto *Record : MatchedRecords) {
+ OS << " case GIPFP_" << TypeIdentifier << "_Predicate_"
+ << Record->getName() << ": {\n"
+ << " " << Record->getValueAsString("ImmediateCode") << "\n"
+ << " llvm_unreachable(\"ImmediateCode should have returned\");\n"
+ << " return false;\n"
+ << " }\n";
+ }
+ if (!MatchedRecords.empty())
+ OS << " }\n";
+ OS << " llvm_unreachable(\"Unknown predicate\");\n"
+ << " return false;\n"
+ << "}\n";
}
std::vector<Matcher *> GlobalISelEmitter::optimizeRules(
- std::vector<RuleMatcher> &Rules,
+ const std::vector<Matcher *> &Rules,
std::vector<std::unique_ptr<GroupMatcher>> &StorageGroupMatcher) {
std::vector<Matcher *> OptRules;
// Start with a stupid grouping for now.
std::unique_ptr<GroupMatcher> CurrentGroup = make_unique<GroupMatcher>();
assert(CurrentGroup->conditions_empty());
unsigned NbGroup = 0;
- for (RuleMatcher &Rule : Rules) {
- std::unique_ptr<PredicateMatcher> Predicate = Rule.forgetFirstCondition();
+ for (Matcher *Rule : Rules) {
+ std::unique_ptr<PredicateMatcher> Predicate = Rule->forgetFirstCondition();
if (!CurrentGroup->conditions_empty() &&
!CurrentGroup->lastConditionMatches(*Predicate)) {
// Start a new group.
@@ -3572,7 +3591,7 @@ std::vector<Matcher *> GlobalISelEmitter::optimizeRules(
}
if (CurrentGroup->conditions_empty())
CurrentGroup->addCondition(std::move(Predicate));
- CurrentGroup->addRule(Rule);
+ CurrentGroup->addRule(*Rule);
}
if (!CurrentGroup->conditions_empty()) {
++NbGroup;
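The grouping above is deliberately simple: consecutive rules whose first predicate matches are folded into one GroupMatcher that records the shared condition once. A self-contained sketch of the same run-length grouping over a plain string key (illustrative only; it does not use the emitter's Matcher classes):

#include <string>
#include <utility>
#include <vector>

// Fold consecutive (key, value) pairs with an identical key into one group,
// mirroring how adjacent rules sharing a first condition land in one group.
std::vector<std::pair<std::string, std::vector<int>>>
groupByFirstKey(const std::vector<std::pair<std::string, int>> &Rules) {
  std::vector<std::pair<std::string, std::vector<int>>> Groups;
  for (const auto &Rule : Rules) {
    if (Groups.empty() || Groups.back().first != Rule.first)
      Groups.push_back({Rule.first, {}}); // key changed: start a new group
    Groups.back().second.push_back(Rule.second);
  }
  return Groups;
}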
@@ -3662,12 +3681,17 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
"MatcherInfo;\n"
<< " static " << Target.getName()
<< "InstructionSelector::ComplexMatcherMemFn ComplexPredicateFns[];\n"
+ << "bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const "
+ "override;\n"
+ << "bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) "
+ "const override;\n"
+ << "bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat "
+ "&Imm) const override;\n"
<< "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL\n\n";
OS << "#ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n"
<< ", State(" << MaxTemporaries << "),\n"
- << "MatcherInfo({TypeObjects, FeatureBitsets, I64ImmPredicateFns, "
- "APIntImmPredicateFns, APFloatImmPredicateFns, ComplexPredicateFns})\n"
+ << "MatcherInfo({TypeObjects, FeatureBitsets, ComplexPredicateFns})\n"
<< "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n\n";
OS << "#ifdef GET_GLOBALISEL_IMPL\n";
@@ -3823,12 +3847,13 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
});
std::vector<std::unique_ptr<GroupMatcher>> StorageGroupMatcher;
- std::vector<Matcher *> OptRules;
- if (OptimizeMatchTable)
- OptRules = optimizeRules(Rules, StorageGroupMatcher);
- else
- for (Matcher &Rule : Rules)
- OptRules.push_back(&Rule);
+ std::vector<Matcher *> InputRules;
+ for (Matcher &Rule : Rules)
+ InputRules.push_back(&Rule);
+
+ std::vector<Matcher *> OptRules =
+ OptimizeMatchTable ? optimizeRules(InputRules, StorageGroupMatcher)
+ : InputRules;
MatchTable Table(0);
for (Matcher *Rule : OptRules) {
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index b4e61ec53c19..37e024b1665e 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -717,6 +717,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
if (addComma)
OS << ",";
OS << "Attribute::InaccessibleMemOrArgMemOnly";
+ break;
case CodeGenIntrinsic::ReadWriteMem:
break;
}
diff --git a/utils/TableGen/SDNodeProperties.cpp b/utils/TableGen/SDNodeProperties.cpp
new file mode 100644
index 000000000000..343febc99d1e
--- /dev/null
+++ b/utils/TableGen/SDNodeProperties.cpp
@@ -0,0 +1,49 @@
+//===- SDNodeProperties.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SDNodeProperties.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+
+using namespace llvm;
+
+unsigned llvm::parseSDPatternOperatorProperties(Record *R) {
+ unsigned Properties = 0;
+ for (Record *Property : R->getValueAsListOfDefs("Properties")) {
+ if (Property->getName() == "SDNPCommutative") {
+ Properties |= 1 << SDNPCommutative;
+ } else if (Property->getName() == "SDNPAssociative") {
+ Properties |= 1 << SDNPAssociative;
+ } else if (Property->getName() == "SDNPHasChain") {
+ Properties |= 1 << SDNPHasChain;
+ } else if (Property->getName() == "SDNPOutGlue") {
+ Properties |= 1 << SDNPOutGlue;
+ } else if (Property->getName() == "SDNPInGlue") {
+ Properties |= 1 << SDNPInGlue;
+ } else if (Property->getName() == "SDNPOptInGlue") {
+ Properties |= 1 << SDNPOptInGlue;
+ } else if (Property->getName() == "SDNPMayStore") {
+ Properties |= 1 << SDNPMayStore;
+ } else if (Property->getName() == "SDNPMayLoad") {
+ Properties |= 1 << SDNPMayLoad;
+ } else if (Property->getName() == "SDNPSideEffect") {
+ Properties |= 1 << SDNPSideEffect;
+ } else if (Property->getName() == "SDNPMemOperand") {
+ Properties |= 1 << SDNPMemOperand;
+ } else if (Property->getName() == "SDNPVariadic") {
+ Properties |= 1 << SDNPVariadic;
+ } else {
+ PrintFatalError("Unknown SD Node property '" +
+ Property->getName() + "' on node '" +
+ R->getName() + "'!");
+ }
+ }
+
+ return Properties;
+}
diff --git a/utils/TableGen/SDNodeProperties.h b/utils/TableGen/SDNodeProperties.h
new file mode 100644
index 000000000000..a8d4efb5dab0
--- /dev/null
+++ b/utils/TableGen/SDNodeProperties.h
@@ -0,0 +1,40 @@
+//===- SDNodeProperties.h ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_SDNODEPROPERTIES_H
+#define LLVM_UTILS_TABLEGEN_SDNODEPROPERTIES_H
+
+namespace llvm {
+
+class Record;
+
+// SelectionDAG node properties.
+// SDNPMemOperand: indicates that a node touches memory and therefore must
+// have an associated memory operand that describes the access.
+enum SDNP {
+ SDNPCommutative,
+ SDNPAssociative,
+ SDNPHasChain,
+ SDNPOutGlue,
+ SDNPInGlue,
+ SDNPOptInGlue,
+ SDNPMayLoad,
+ SDNPMayStore,
+ SDNPSideEffect,
+ SDNPMemOperand,
+ SDNPVariadic,
+ SDNPWantRoot,
+ SDNPWantParent
+};
+
+unsigned parseSDPatternOperatorProperties(Record *R);
+
+}
+
+#endif
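A minimal sketch of how a TableGen backend could consume the bitmask returned by parseSDPatternOperatorProperties; the helper nodeMayTouchMemory is hypothetical, while the enum values and the parse function are the ones declared in this header:

#include "SDNodeProperties.h"
#include "llvm/TableGen/Record.h"

using namespace llvm;

// Hypothetical helper: true if the node's properties imply a memory access.
static bool nodeMayTouchMemory(Record *R) {
  unsigned Props = parseSDPatternOperatorProperties(R);
  return Props & ((1 << SDNPMayLoad) | (1 << SDNPMayStore) |
                  (1 << SDNPMemOperand));
}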
diff --git a/utils/docker/build_docker_image.sh b/utils/docker/build_docker_image.sh
index ad7831925f9a..9b0ba46fe4b3 100755
--- a/utils/docker/build_docker_image.sh
+++ b/utils/docker/build_docker_image.sh
@@ -30,6 +30,10 @@ Available options:
'branches/release_40'
(default: 'trunk')
-r|--revision svn revision to checkout
+ -c|--cherrypick revision to cherry-pick. Can be specified multiple times.
+ Cherry-picks are applied in sorted order using the
+ following command:
+ 'svn patch <(svn diff -c \$rev)'.
-p|--llvm-project name of an svn project to checkout. Will also add the
project to a list LLVM_ENABLE_PROJECTS, passed to CMake.
For clang, please use 'clang', not 'cfe'.
@@ -92,7 +96,7 @@ while [[ $# -gt 0 ]]; do
DOCKER_TAG="$1"
shift
;;
- -i|--install-target|-r|--revision|-b|--branch|-p|--llvm-project)
+ -i|--install-target|-r|--revision|-c|--cherrypick|-b|--branch|-p|--llvm-project)
if [ "$1" == "-i" ] || [ "$1" == "--install-target" ]; then
SEEN_INSTALL_TARGET=1
fi
diff --git a/utils/docker/scripts/build_install_llvm.sh b/utils/docker/scripts/build_install_llvm.sh
index 79ce7e50efdc..5141fdc9bb5e 100755
--- a/utils/docker/scripts/build_install_llvm.sh
+++ b/utils/docker/scripts/build_install_llvm.sh
@@ -25,6 +25,10 @@ Available options:
'branches/release_40'
(default: 'trunk')
-r|--revision svn revision to checkout
+ -c|--cherrypick revision to cherry-pick. Can be specified multiple times.
+ Cherry-picks are applied in sorted order using the
+ following command:
+ 'svn patch <(svn diff -c \$rev)'.
-p|--llvm-project name of an svn project to checkout. Will also add the
project to a list LLVM_ENABLE_PROJECTS, passed to CMake.
For clang, please use 'clang', not 'cfe'.
@@ -40,6 +44,7 @@ EOF
}
LLVM_SVN_REV=""
+CHERRYPICKS=""
LLVM_BRANCH=""
CMAKE_ARGS=""
CMAKE_INSTALL_TARGETS=""
@@ -77,6 +82,11 @@ while [[ $# -gt 0 ]]; do
LLVM_SVN_REV="$1"
shift
;;
+ -c|--cherrypick)
+ shift
+ CHERRYPICKS="$CHERRYPICKS $1"
+ shift
+ ;;
-b|--branch)
shift
LLVM_BRANCH="$1"
@@ -153,6 +163,28 @@ else
echo "Checking out latest svn revision."
fi
+# Sort cherrypicks and remove duplicates.
+CHERRYPICKS="$(echo "$CHERRYPICKS" | xargs -n1 | sort | uniq | xargs)"
+
+function apply_cherrypicks() {
+ local CHECKOUT_DIR="$1"
+
+ [ "$CHERRYPICKS" == "" ] || echo "Applying cherrypicks"
+ pushd "$CHECKOUT_DIR"
+
+ # This function is always called on a sorted list of cherrypicks.
+ for CHERRY_REV in $CHERRYPICKS; do
+ echo "Cherry-picking r$CHERRY_REV into $CHECKOUT_DIR"
+
+ local PATCH_FILE="$(mktemp)"
+ svn diff -c $CHERRY_REV > "$PATCH_FILE"
+ svn patch "$PATCH_FILE"
+ rm "$PATCH_FILE"
+ done
+
+ popd
+}
+
CLANG_BUILD_DIR=/tmp/clang-build
CLANG_INSTALL_DIR=/tmp/clang-install
@@ -172,6 +204,11 @@ for LLVM_PROJECT in $LLVM_PROJECTS; do
svn co -q $SVN_REV_ARG \
"https://llvm.org/svn/llvm-project/$SVN_PROJECT/$LLVM_BRANCH" \
"$CLANG_BUILD_DIR/src/$LLVM_PROJECT"
+
+ # We apply cherry-picks to every repository, regardless of whether the
+ # revision actually touches it. For repositories the revision does not
+ # affect, applying the cherry-pick is a no-op.
+ apply_cherrypicks "$CLANG_BUILD_DIR/src/$LLVM_PROJECT"
done
if [ $CLANG_TOOLS_EXTRA_ENABLED -ne 0 ]; then
@@ -179,6 +216,8 @@ if [ $CLANG_TOOLS_EXTRA_ENABLED -ne 0 ]; then
svn co -q $SVN_REV_ARG \
"https://llvm.org/svn/llvm-project/clang-tools-extra/$LLVM_BRANCH" \
"$CLANG_BUILD_DIR/src/clang/tools/extra"
+
+ apply_cherrypicks "$CLANG_BUILD_DIR/src/clang/tools/extra"
fi
CHECKSUMS_FILE="/tmp/checksums/checksums.txt"
diff --git a/utils/git-svn/git-llvm b/utils/git-svn/git-llvm
index 0d566dac430a..5d9d4d291006 100755
--- a/utils/git-svn/git-llvm
+++ b/utils/git-svn/git-llvm
@@ -178,7 +178,7 @@ def clean_and_update_svn(svn_repo):
# Unfortunately it appears there's no svn equivalent for git clean, so we
# have to do it ourselves.
- for line in svn(svn_repo, 'status').split('\n'):
+ for line in svn(svn_repo, 'status', '--no-ignore').split('\n'):
if not line.startswith('?'):
continue
filename = line[1:].strip()
@@ -252,7 +252,7 @@ def svn_push_one_rev(svn_repo, rev, dry_run):
if not subrepos:
raise RuntimeError('Empty diff for rev %s?' % rev)
- status = svn(svn_repo, 'status')
+ status = svn(svn_repo, 'status', '--no-ignore')
if status:
die("Can't push git rev %s because svn status is not empty:\n%s" %
(rev, status))
@@ -272,10 +272,11 @@ def svn_push_one_rev(svn_repo, rev, dry_run):
"first?")
sys.exit(2)
- status_lines = svn(svn_repo, 'status').split('\n')
+ status_lines = svn(svn_repo, 'status', '--no-ignore').split('\n')
- for l in (l for l in status_lines if l.startswith('?')):
- svn(svn_repo, 'add', l[1:].strip())
+ for l in (l for l in status_lines if (l.startswith('?') or
+ l.startswith('I'))):
+ svn(svn_repo, 'add', '--no-ignore', l[1:].strip())
for l in (l for l in status_lines if l.startswith('!')):
svn(svn_repo, 'remove', l[1:].strip())
diff --git a/utils/update_mir_test_checks.py b/utils/update_mir_test_checks.py
index 2934f09f6b37..3756af1b517d 100755
--- a/utils/update_mir_test_checks.py
+++ b/utils/update_mir_test_checks.py
@@ -33,16 +33,20 @@ TRIPLE_IR_RE = re.compile(r'^\s*target\s+triple\s*=\s*"([^"]+)"$')
CHECK_PREFIX_RE = re.compile('--?check-prefix(?:es)?[= ](\S+)')
CHECK_RE = re.compile(r'^\s*[;#]\s*([^:]+?)(?:-NEXT|-NOT|-DAG|-LABEL)?:')
-FUNC_NAME_RE = re.compile(r' *name: *(?P<func>[A-Za-z0-9_.-]+)')
-BODY_BEGIN_RE = re.compile(r' *body: *\|')
-BASIC_BLOCK_RE = re.compile(r' *bb\.[0-9]+.*:$')
+MIR_FUNC_NAME_RE = re.compile(r' *name: *(?P<func>[A-Za-z0-9_.-]+)')
+MIR_BODY_BEGIN_RE = re.compile(r' *body: *\|')
+MIR_BASIC_BLOCK_RE = re.compile(r' *bb\.[0-9]+.*:$')
VREG_RE = re.compile(r'(%[0-9]+)(?::[a-z0-9_]+)?(?:\([<>a-z0-9 ]+\))?')
VREG_DEF_RE = re.compile(
r'^ *(?P<vregs>{0}(?:, {0})*) '
r'= (?P<opcode>[A-Zt][A-Za-z0-9_]+)'.format(VREG_RE.pattern))
-PREFIX_DATA_RE = re.compile(r'^ *(;|bb.[0-9].*: *$|[a-z]+:( |$)|$)')
+MIR_PREFIX_DATA_RE = re.compile(r'^ *(;|bb.[0-9].*: *$|[a-z]+:( |$)|$)')
VREG_CLASS_RE = re.compile(r'^ *- *{ id: ([0-9]+), class: ([a-z0-9_]+)', re.M)
+IR_FUNC_NAME_RE = re.compile(
+ r'^\s*define\s+(?:internal\s+)?[^@]*@(?P<func>\w+)\s*\(')
+IR_PREFIX_DATA_RE = re.compile(r'^ *(;|$)')
+
MIR_FUNC_RE = re.compile(
r'^---$'
r'\n'
@@ -164,13 +168,13 @@ def find_functions_with_one_bb(lines, verbose=False):
cur_func = None
bbs = 0
for line in lines:
- m = FUNC_NAME_RE.match(line)
+ m = MIR_FUNC_NAME_RE.match(line)
if m:
if bbs == 1:
result.append(cur_func)
cur_func = m.group('func')
bbs = 0
- m = BASIC_BLOCK_RE.match(line)
+ m = MIR_BASIC_BLOCK_RE.match(line)
if m:
bbs += 1
if bbs == 1:
@@ -340,8 +344,10 @@ def update_test_file(llc, test, remove_common_prefixes=False,
warn('Ignoring common prefixes: {}'.format(common_prefixes),
test_file=test)
- autogenerated_note = ('# NOTE: Assertions have been autogenerated by '
- 'utils/{}'.format(os.path.basename(__file__)))
+ comment_char = '#' if test.endswith('.mir') else ';'
+ autogenerated_note = ('{} NOTE: Assertions have been autogenerated by '
+ 'utils/{}'.format(comment_char,
+ os.path.basename(__file__)))
output_lines = []
output_lines.append(autogenerated_note)
@@ -350,48 +356,69 @@ def update_test_file(llc, test, remove_common_prefixes=False,
continue
if state == 'toplevel':
+ m = IR_FUNC_NAME_RE.match(input_line)
+ if m:
+ state = 'ir function prefix'
+ func_name = m.group('func')
if input_line.strip() == '---':
state = 'document'
output_lines.append(input_line)
elif state == 'document':
- m = FUNC_NAME_RE.match(input_line)
+ m = MIR_FUNC_NAME_RE.match(input_line)
if m:
- state = 'function metadata'
+ state = 'mir function metadata'
func_name = m.group('func')
if input_line.strip() == '...':
state = 'toplevel'
func_name = None
if should_add_line_to_output(input_line, prefix_set):
output_lines.append(input_line)
- elif state == 'function metadata':
+ elif state == 'mir function metadata':
if should_add_line_to_output(input_line, prefix_set):
output_lines.append(input_line)
- m = BODY_BEGIN_RE.match(input_line)
+ m = MIR_BODY_BEGIN_RE.match(input_line)
if m:
if func_name in simple_functions:
# If there's only one block, put the checks inside it
- state = 'function prefix'
+ state = 'mir function prefix'
continue
- state = 'function body'
+ state = 'mir function body'
add_checks_for_function(test, output_lines, run_list,
func_dict, func_name, add_vreg_checks,
single_bb=False, verbose=verbose)
- elif state == 'function prefix':
- m = PREFIX_DATA_RE.match(input_line)
+ elif state == 'mir function prefix':
+ m = MIR_PREFIX_DATA_RE.match(input_line)
if not m:
- state = 'function body'
+ state = 'mir function body'
add_checks_for_function(test, output_lines, run_list,
func_dict, func_name, add_vreg_checks,
single_bb=True, verbose=verbose)
if should_add_line_to_output(input_line, prefix_set):
output_lines.append(input_line)
- elif state == 'function body':
+ elif state == 'mir function body':
if input_line.strip() == '...':
state = 'toplevel'
func_name = None
if should_add_line_to_output(input_line, prefix_set):
output_lines.append(input_line)
+ elif state == 'ir function prefix':
+ m = IR_PREFIX_DATA_RE.match(input_line)
+ if not m:
+ state = 'ir function body'
+ add_checks_for_function(test, output_lines, run_list,
+ func_dict, func_name, add_vreg_checks,
+ single_bb=False, verbose=verbose)
+
+ if should_add_line_to_output(input_line, prefix_set):
+ output_lines.append(input_line)
+ elif state == 'ir function body':
+ if input_line.strip() == '}':
+ state = 'toplevel'
+ func_name = None
+ if should_add_line_to_output(input_line, prefix_set):
+ output_lines.append(input_line)
+
log('Writing {} lines to {}...'.format(len(output_lines), test), verbose)