From 6694ed095d6b27a2c92ec4fd63664fcd88a05749 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 6 Jan 2017 20:13:35 +0000 Subject: Vendor import of clang trunk r291274: https://llvm.org/svn/llvm-project/cfe/trunk@291274 --- examples/clang-interpreter/main.cpp | 2 +- include/clang/AST/DeclCXX.h | 2 +- include/clang/ASTMatchers/Dynamic/VariantValue.h | 7 +- include/clang/Basic/Attr.td | 26 +- include/clang/Basic/BuiltinsPPC.def | 3 + include/clang/Basic/DiagnosticSemaKinds.td | 6 +- include/clang/CodeGen/BackendUtil.h | 4 +- include/clang/Driver/ToolChain.h | 7 + include/clang/Frontend/ASTUnit.h | 34 +- include/clang/Frontend/CompilerInstance.h | 18 +- include/clang/Frontend/CompilerInvocation.h | 12 +- include/clang/Frontend/FrontendOptions.h | 2 +- include/clang/Frontend/Utils.h | 6 +- include/clang/Lex/HeaderSearch.h | 4 +- include/clang/Lex/HeaderSearchOptions.h | 2 +- include/clang/Lex/Preprocessor.h | 11 +- include/clang/Lex/PreprocessorOptions.h | 6 +- include/clang/Sema/CodeCompleteConsumer.h | 17 +- include/clang/Sema/Ownership.h | 8 +- include/clang/Sema/Sema.h | 15 +- include/clang/Serialization/ASTReader.h | 62 ++-- include/clang/Serialization/ASTWriter.h | 99 +++--- include/clang/Serialization/ModuleFileExtension.h | 2 +- .../StaticAnalyzer/Core/BugReporter/BugReporter.h | 7 +- .../Core/BugReporter/BugReporterVisitor.h | 136 ++++---- .../Core/BugReporter/PathDiagnostic.h | 29 +- include/clang/StaticAnalyzer/Core/CheckerManager.h | 8 +- include/clang/Tooling/Tooling.h | 8 +- lib/ARCMigrate/ARCMT.cpp | 4 +- lib/AST/ASTContext.cpp | 4 +- lib/ASTMatchers/Dynamic/VariantValue.cpp | 8 +- lib/Basic/Targets.cpp | 181 +++++++++-- lib/CodeGen/BackendUtil.cpp | 30 +- lib/CodeGen/CGBuiltin.cpp | 84 +++++ lib/CodeGen/CGCall.cpp | 6 +- lib/CodeGen/CGExpr.cpp | 11 +- lib/CodeGen/CGOpenMPRuntime.cpp | 23 +- lib/CodeGen/CGOpenMPRuntime.h | 36 ++- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 270 ++++++++-------- lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 51 +-- lib/CodeGen/CodeGenAction.cpp | 16 +- lib/CodeGen/CodeGenFunction.h | 2 +- lib/CodeGen/ObjectFilePCHContainerOperations.cpp | 9 +- lib/CodeGen/TargetInfo.cpp | 206 ++++++++++-- lib/Driver/Driver.cpp | 3 + lib/Driver/MSVCToolChain.cpp | 13 +- lib/Driver/MinGWToolChain.cpp | 17 +- lib/Driver/ToolChains.cpp | 37 ++- lib/Driver/ToolChains.h | 34 +- lib/Driver/Tools.cpp | 40 ++- lib/Driver/Tools.h | 13 + lib/Frontend/ASTUnit.cpp | 85 +++-- lib/Frontend/ChainedIncludesSource.cpp | 4 +- lib/Frontend/CompilerInstance.cpp | 38 ++- lib/Frontend/CompilerInvocation.cpp | 15 +- lib/Frontend/CreateInvocationFromCommandLine.cpp | 10 +- lib/Frontend/FrontendAction.cpp | 2 +- lib/Frontend/SerializedDiagnosticPrinter.cpp | 40 +-- lib/Frontend/TestModuleFileExtension.cpp | 4 +- lib/Headers/__clang_cuda_cmath.h | 10 +- lib/Headers/__clang_cuda_intrinsics.h | 42 +-- lib/Headers/altivec.h | 3 + lib/Headers/intrin.h | 90 ------ lib/Lex/HeaderSearch.cpp | 2 +- lib/Lex/Preprocessor.cpp | 2 +- lib/Parse/ParseDecl.cpp | 8 +- lib/Parse/ParseExpr.cpp | 2 + lib/Parse/ParsePragma.cpp | 6 +- lib/Sema/SemaCodeComplete.cpp | 10 +- lib/Sema/SemaDeclCXX.cpp | 75 ++--- lib/Sema/SemaExpr.cpp | 3 + lib/Sema/SemaExprCXX.cpp | 2 + lib/Sema/SemaOverload.cpp | 24 +- lib/Sema/SemaTemplateDeduction.cpp | 244 ++++++++------ lib/Sema/SemaTemplateInstantiateDecl.cpp | 42 ++- lib/Serialization/ASTReader.cpp | 52 +-- lib/Serialization/ASTWriter.cpp | 201 ++++++------ lib/Serialization/ASTWriterDecl.cpp | 58 ++-- lib/Serialization/GeneratePCH.cpp | 2 +- lib/Serialization/GlobalModuleIndex.cpp | 
4 +- lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp | 19 +- .../Checkers/DynamicTypePropagation.cpp | 19 +- .../Checkers/LocalizationChecker.cpp | 16 +- .../Checkers/MPI-Checker/MPIBugReporter.cpp | 10 +- .../Checkers/MPI-Checker/MPIBugReporter.h | 8 +- .../Checkers/MacOSKeychainAPIChecker.cpp | 21 +- lib/StaticAnalyzer/Checkers/MallocChecker.cpp | 18 +- lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp | 19 +- .../Checkers/ObjCSuperDeallocChecker.cpp | 18 +- lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp | 21 +- .../Checkers/TestAfterDivZeroChecker.cpp | 17 +- lib/StaticAnalyzer/Checkers/ValistChecker.cpp | 12 +- lib/StaticAnalyzer/Core/BugReporter.cpp | 298 ++++++++--------- lib/StaticAnalyzer/Core/BugReporterVisitors.cpp | 159 ++++----- lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp | 9 +- lib/StaticAnalyzer/Core/PathDiagnostic.cpp | 65 ++-- lib/StaticAnalyzer/Core/PlistDiagnostics.cpp | 40 +-- .../Frontend/CheckerRegistration.cpp | 2 +- lib/StaticAnalyzer/Frontend/ModelInjector.cpp | 7 +- lib/Tooling/Tooling.cpp | 15 +- test/CodeGen/builtins-ppc-error.c | 20 ++ test/CodeGen/builtins-ppc-p9vector.c | 47 ++- test/CodeGen/catch-undef-behavior.c | 22 +- test/CodeGen/sanitize-recover.c | 4 +- test/CodeGen/vectorcall.c | 78 +++-- test/CodeGenCXX/dllexport.cpp | 12 + test/CodeGenCXX/homogeneous-aggregates.cpp | 6 +- test/CodeGenCXX/ubsan-vtable-checks.cpp | 2 +- .../CUDA/v8.0/bin/.keep | 0 .../CUDA/v8.0/include/.keep | 0 .../CUDA/v8.0/lib/.keep | 0 .../v8.0/nvvm/libdevice/libdevice.compute_30.10.bc | 0 .../v8.0/nvvm/libdevice/libdevice.compute_35.10.bc | 0 test/Driver/avr-toolchain.c | 4 + test/Driver/cuda-version-check.cu | 22 +- test/Driver/cuda-windows.cu | 14 + test/Index/complete-block-properties.m | 2 +- test/Index/complete-block-property-assignment.m | 24 +- test/OpenMP/nvptx_target_codegen.cpp | 354 +++++++++++---------- test/OpenMP/target_codegen.cpp | 4 +- test/OpenMP/target_codegen_registration.cpp | 52 +-- test/OpenMP/teams_distribute_collapse_messages.cpp | 3 +- test/Preprocessor/cuda-types.cu | 16 + test/Preprocessor/init.c | 171 ++++++++++ test/Sema/warn-cast-align.c | 8 + test/Sema/warn-strict-prototypes.m | 5 +- test/Sema/warn-thread-safety-analysis.c | 4 + test/SemaCUDA/attr-declspec.cu | 34 ++ test/SemaCUDA/cuda-inherits-calling-conv.cu | 30 ++ test/SemaCXX/constant-expression-cxx11.cpp | 4 +- test/SemaCXX/conversion-function.cpp | 2 +- .../cxx0x-initializer-stdinitializerlist.cpp | 26 +- test/SemaCXX/cxx1z-decomposition.cpp | 5 + test/SemaCXX/default-arg-closures.cpp | 9 +- test/SemaCXX/dllexport.cpp | 21 ++ test/SemaCXX/type-definition-in-specifier.cpp | 6 +- test/SemaObjC/block-omitted-return-type.m | 2 +- test/SemaOpenCL/extensions.cl | 13 + test/SemaTemplate/deduction.cpp | 35 ++ test/SemaTemplate/instantiate-local-class.cpp | 11 + tools/c-index-test/core_main.cpp | 5 +- tools/clang-import-test/clang-import-test.cpp | 2 +- tools/diagtool/ShowEnabledWarnings.cpp | 4 +- tools/libclang/CIndex.cpp | 9 +- tools/libclang/CIndexCodeCompletion.cpp | 19 +- tools/libclang/CXIndexDataConsumer.cpp | 4 +- tools/libclang/CXIndexDataConsumer.h | 2 +- tools/libclang/CXTranslationUnit.h | 3 +- tools/libclang/Indexing.cpp | 31 +- unittests/AST/ExternalASTSourceTest.cpp | 4 +- unittests/ASTMatchers/ASTMatchersTraversalTest.cpp | 9 +- unittests/Basic/SourceManagerTest.cpp | 24 +- unittests/Format/FormatTestJS.cpp | 4 +- unittests/Frontend/CodeGenActionTest.cpp | 4 +- unittests/Frontend/FrontendActionTest.cpp | 20 +- unittests/Lex/LexerTest.cpp | 8 +- 
unittests/Lex/PPCallbacksTest.cpp | 18 +- unittests/Lex/PPConditionalDirectiveRecordTest.cpp | 8 +- utils/TableGen/ClangAttrEmitter.cpp | 13 +- 159 files changed, 2879 insertions(+), 1898 deletions(-) create mode 100644 test/CodeGen/builtins-ppc-error.c create mode 100644 test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/.keep create mode 100644 test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/include/.keep create mode 100644 test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/lib/.keep create mode 100644 test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/nvvm/libdevice/libdevice.compute_30.10.bc create mode 100644 test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/nvvm/libdevice/libdevice.compute_35.10.bc create mode 100644 test/Driver/avr-toolchain.c create mode 100644 test/Driver/cuda-windows.cu create mode 100644 test/SemaCUDA/attr-declspec.cu create mode 100644 test/SemaCUDA/cuda-inherits-calling-conv.cu diff --git a/examples/clang-interpreter/main.cpp b/examples/clang-interpreter/main.cpp index 9b4a257bcba3..f7832291f2b6 100644 --- a/examples/clang-interpreter/main.cpp +++ b/examples/clang-interpreter/main.cpp @@ -145,7 +145,7 @@ int main(int argc, const char **argv, char * const *envp) { // Create a compiler instance to handle the actual work. CompilerInstance Clang; - Clang.setInvocation(CI.release()); + Clang.setInvocation(std::move(CI)); // Create the compilers actual diagnostics engine. Clang.createDiagnostics(); diff --git a/include/clang/AST/DeclCXX.h b/include/clang/AST/DeclCXX.h index 06ecd3c37342..0ca08db16299 100644 --- a/include/clang/AST/DeclCXX.h +++ b/include/clang/AST/DeclCXX.h @@ -3181,7 +3181,7 @@ public: /// Get the using declaration from which this was instantiated. This will /// always be an UnresolvedUsingValueDecl or an UnresolvedUsingTypenameDecl /// that is a pack expansion. - NamedDecl *getInstantiatedFromUsingDecl() { return InstantiatedFrom; } + NamedDecl *getInstantiatedFromUsingDecl() const { return InstantiatedFrom; } /// Get the set of using declarations that this pack expanded into. Note that /// some of these may still be unresolved. diff --git a/include/clang/ASTMatchers/Dynamic/VariantValue.h b/include/clang/ASTMatchers/Dynamic/VariantValue.h index 9f694d0ce434..2c80b5137320 100644 --- a/include/clang/ASTMatchers/Dynamic/VariantValue.h +++ b/include/clang/ASTMatchers/Dynamic/VariantValue.h @@ -119,7 +119,7 @@ class VariantMatcher { /// \brief Payload interface to be specialized by each matcher type. /// /// It follows a similar interface as VariantMatcher itself. 
- class Payload : public RefCountedBase { + class Payload { public: virtual ~Payload(); virtual llvm::Optional getSingleMatcher() const = 0; @@ -208,7 +208,8 @@ public: std::string getTypeAsString() const; private: - explicit VariantMatcher(Payload *Value) : Value(Value) {} + explicit VariantMatcher(std::shared_ptr Value) + : Value(std::move(Value)) {} template struct TypedMatcherOps; @@ -216,7 +217,7 @@ private: class PolymorphicPayload; class VariadicOpPayload; - IntrusiveRefCntPtr Value; + std::shared_ptr Value; }; template diff --git a/include/clang/Basic/Attr.td b/include/clang/Basic/Attr.td index 107a3bdffa65..e3c2b0e45d3d 100644 --- a/include/clang/Basic/Attr.td +++ b/include/clang/Basic/Attr.td @@ -601,49 +601,53 @@ def Constructor : InheritableAttr { let Documentation = [Undocumented]; } +// CUDA attributes are spelled __attribute__((attr)) or __declspec(__attr__). + def CUDAConstant : InheritableAttr { - let Spellings = [GNU<"constant">]; + let Spellings = [GNU<"constant">, Declspec<"__constant__">]; let Subjects = SubjectList<[Var]>; let LangOpts = [CUDA]; let Documentation = [Undocumented]; } def CUDACudartBuiltin : IgnoredAttr { - let Spellings = [GNU<"cudart_builtin">]; + let Spellings = [GNU<"cudart_builtin">, Declspec<"__cudart_builtin__">]; let LangOpts = [CUDA]; } def CUDADevice : InheritableAttr { - let Spellings = [GNU<"device">]; + let Spellings = [GNU<"device">, Declspec<"__device__">]; let Subjects = SubjectList<[Function, Var]>; let LangOpts = [CUDA]; let Documentation = [Undocumented]; } def CUDADeviceBuiltin : IgnoredAttr { - let Spellings = [GNU<"device_builtin">]; + let Spellings = [GNU<"device_builtin">, Declspec<"__device_builtin__">]; let LangOpts = [CUDA]; } def CUDADeviceBuiltinSurfaceType : IgnoredAttr { - let Spellings = [GNU<"device_builtin_surface_type">]; + let Spellings = [GNU<"device_builtin_surface_type">, + Declspec<"__device_builtin_surface_type__">]; let LangOpts = [CUDA]; } def CUDADeviceBuiltinTextureType : IgnoredAttr { - let Spellings = [GNU<"device_builtin_texture_type">]; + let Spellings = [GNU<"device_builtin_texture_type">, + Declspec<"__device_builtin_texture_type__">]; let LangOpts = [CUDA]; } def CUDAGlobal : InheritableAttr { - let Spellings = [GNU<"global">]; + let Spellings = [GNU<"global">, Declspec<"__global__">]; let Subjects = SubjectList<[Function]>; let LangOpts = [CUDA]; let Documentation = [Undocumented]; } def CUDAHost : InheritableAttr { - let Spellings = [GNU<"host">]; + let Spellings = [GNU<"host">, Declspec<"__host__">]; let Subjects = SubjectList<[Function]>; let LangOpts = [CUDA]; let Documentation = [Undocumented]; @@ -657,7 +661,7 @@ def CUDAInvalidTarget : InheritableAttr { } def CUDALaunchBounds : InheritableAttr { - let Spellings = [GNU<"launch_bounds">]; + let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">]; let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>]; let LangOpts = [CUDA]; let Subjects = SubjectList<[ObjCMethod, FunctionLike], WarnDiag, @@ -669,7 +673,7 @@ def CUDALaunchBounds : InheritableAttr { } def CUDAShared : InheritableAttr { - let Spellings = [GNU<"shared">]; + let Spellings = [GNU<"shared">, Declspec<"__shared__">]; let Subjects = SubjectList<[Var]>; let LangOpts = [CUDA]; let Documentation = [Undocumented]; @@ -1195,6 +1199,8 @@ def NoThrow : InheritableAttr { } def NvWeak : IgnoredAttr { + // No Declspec spelling of this attribute; the CUDA headers use + // __attribute__((nv_weak)) unconditionally. 
let Spellings = [GNU<"nv_weak">]; let LangOpts = [CUDA]; } diff --git a/include/clang/Basic/BuiltinsPPC.def b/include/clang/Basic/BuiltinsPPC.def index 657ea4225aa8..f7cddc03131b 100644 --- a/include/clang/Basic/BuiltinsPPC.def +++ b/include/clang/Basic/BuiltinsPPC.def @@ -417,6 +417,9 @@ BUILTIN(__builtin_vsx_xvcvhpsp, "V4fV8Us", "") BUILTIN(__builtin_vsx_xvtstdcdp, "V2ULLiV2dIi", "") BUILTIN(__builtin_vsx_xvtstdcsp, "V4UiV4fIi", "") +BUILTIN(__builtin_vsx_insertword, "V16UcV4UiV16UcIi", "") +BUILTIN(__builtin_vsx_extractuword, "V2ULLiV16UcIi", "") + // HTM builtins BUILTIN(__builtin_tbegin, "UiUIi", "") BUILTIN(__builtin_tend, "UiUIi", "") diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td index 610fe0cb4c01..0807bba45fc4 100644 --- a/include/clang/Basic/DiagnosticSemaKinds.td +++ b/include/clang/Basic/DiagnosticSemaKinds.td @@ -3377,8 +3377,10 @@ def note_addrof_ovl_candidate_disabled_by_enable_if_attr : Note< "candidate function made ineligible by enable_if">; def note_ovl_candidate_deduced_mismatch : Note< "candidate template ignored: deduced type " - "%diff{$ of %ordinal0 parameter does not match adjusted type $ of argument" - "|of %ordinal0 parameter does not match adjusted type of argument}1,2%3">; + "%diff{$ of %select{|element of }4%ordinal0 parameter does not match " + "adjusted type $ of %select{|element of }4argument" + "|of %select{|element of }4%ordinal0 parameter does not match " + "adjusted type of %select{|element of }4argument}1,2%3">; def note_ovl_candidate_non_deduced_mismatch : Note< "candidate template ignored: could not match %diff{$ against $|types}0,1">; // This note is needed because the above note would sometimes print two diff --git a/include/clang/CodeGen/BackendUtil.h b/include/clang/CodeGen/BackendUtil.h index 01721d322098..c6abc6e3f574 100644 --- a/include/clang/CodeGen/BackendUtil.h +++ b/include/clang/CodeGen/BackendUtil.h @@ -21,6 +21,7 @@ namespace llvm { namespace clang { class DiagnosticsEngine; + class HeaderSearchOptions; class CodeGenOptions; class TargetOptions; class LangOptions; @@ -34,7 +35,8 @@ namespace clang { Backend_EmitObj ///< Emit native object files }; - void EmitBackendOutput(DiagnosticsEngine &Diags, const CodeGenOptions &CGOpts, + void EmitBackendOutput(DiagnosticsEngine &Diags, const HeaderSearchOptions &, + const CodeGenOptions &CGOpts, const TargetOptions &TOpts, const LangOptions &LOpts, const llvm::DataLayout &TDesc, llvm::Module *M, BackendAction Action, diff --git a/include/clang/Driver/ToolChain.h b/include/clang/Driver/ToolChain.h index cca239c4be2a..ffb0d60a6398 100644 --- a/include/clang/Driver/ToolChain.h +++ b/include/clang/Driver/ToolChain.h @@ -139,6 +139,13 @@ public: vfs::FileSystem &getVFS() const; const llvm::Triple &getTriple() const { return Triple; } + /// Get the toolchain's aux triple, if it has one. + /// + /// Exactly what the aux triple represents depends on the toolchain, but for + /// example when compiling CUDA code for the GPU, the triple might be NVPTX, + /// while the aux triple is the host (CPU) toolchain, e.g. x86-linux-gnu. 
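To make the aux-triple contract documented above concrete, here is a minimal, self-contained sketch (all names invented; real callers go through clang::driver::ToolChain): the base hook defaults to nullptr, and a CUDA-style device toolchain overrides it to report the host triple it is paired with.

    #include <cassert>
    #include <string>

    // Toy model of the getAuxTriple() hook this hunk adds. Only the
    // convention mirrors the patch: nullptr means "no aux triple", and a
    // device toolchain pairs its NVPTX triple with the host triple.
    struct ToolChainSketch {
      std::string Triple;
      virtual ~ToolChainSketch() = default;
      virtual const std::string *getAuxTriple() const { return nullptr; }
    };

    struct CudaDeviceToolChainSketch : ToolChainSketch {
      std::string HostTriple = "x86_64-unknown-linux-gnu";
      const std::string *getAuxTriple() const override { return &HostTriple; }
    };

    int main() {
      CudaDeviceToolChainSketch TC;
      TC.Triple = "nvptx64-nvidia-cuda";
      assert(TC.getAuxTriple() != nullptr);
      ToolChainSketch Host;
      assert(Host.getAuxTriple() == nullptr); // default: no aux triple
      return 0;
    }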
+ virtual const llvm::Triple *getAuxTriple() const { return nullptr; } + llvm::Triple::ArchType getArch() const { return Triple.getArch(); } StringRef getArchName() const { return Triple.getArchName(); } StringRef getPlatform() const { return Triple.getVendorName(); } diff --git a/include/clang/Frontend/ASTUnit.h b/include/clang/Frontend/ASTUnit.h index cc8d4e6e3e70..b1cdb46d505b 100644 --- a/include/clang/Frontend/ASTUnit.h +++ b/include/clang/Frontend/ASTUnit.h @@ -86,10 +86,10 @@ private: IntrusiveRefCntPtr SourceMgr; std::unique_ptr HeaderInfo; IntrusiveRefCntPtr Target; - IntrusiveRefCntPtr PP; + std::shared_ptr PP; IntrusiveRefCntPtr Ctx; std::shared_ptr TargetOpts; - IntrusiveRefCntPtr HSOpts; + std::shared_ptr HSOpts; IntrusiveRefCntPtr Reader; bool HadModuleLoaderFatalFailure; @@ -108,8 +108,8 @@ private: /// Optional owned invocation, just used to make the invocation used in /// LoadFromCommandLine available. - IntrusiveRefCntPtr Invocation; - + std::shared_ptr Invocation; + // OnlyLocalDecls - when true, walking this AST should only visit declarations // that come from the AST itself, not from included precompiled headers. // FIXME: This is temporary; eventually, CIndex will always do this. @@ -358,22 +358,21 @@ public: } /// \brief Retrieve the allocator used to cache global code completions. - IntrusiveRefCntPtr + std::shared_ptr getCachedCompletionAllocator() { return CachedCompletionAllocator; } CodeCompletionTUInfo &getCodeCompletionTUInfo() { if (!CCTUInfo) - CCTUInfo.reset(new CodeCompletionTUInfo( - new GlobalCodeCompletionAllocator)); + CCTUInfo = llvm::make_unique( + std::make_shared()); return *CCTUInfo; } private: /// \brief Allocator used to store cached code completions. - IntrusiveRefCntPtr - CachedCompletionAllocator; + std::shared_ptr CachedCompletionAllocator; std::unique_ptr CCTUInfo; @@ -496,12 +495,13 @@ public: const Preprocessor &getPreprocessor() const { return *PP; } Preprocessor &getPreprocessor() { return *PP; } + std::shared_ptr getPreprocessorPtr() const { return PP; } const ASTContext &getASTContext() const { return *Ctx; } ASTContext &getASTContext() { return *Ctx; } void setASTContext(ASTContext *ctx) { Ctx = ctx; } - void setPreprocessor(Preprocessor *pp); + void setPreprocessor(std::shared_ptr pp); bool hasSema() const { return (bool)TheSema; } Sema &getSema() const { @@ -701,11 +701,11 @@ public: /// remapped contents of that file. typedef std::pair RemappedFile; - /// \brief Create a ASTUnit. Gets ownership of the passed CompilerInvocation. - static ASTUnit *create(CompilerInvocation *CI, - IntrusiveRefCntPtr Diags, - bool CaptureDiagnostics, - bool UserFilesAreVolatile); + /// \brief Create a ASTUnit. Gets ownership of the passed CompilerInvocation. + static std::unique_ptr + create(std::shared_ptr CI, + IntrusiveRefCntPtr Diags, bool CaptureDiagnostics, + bool UserFilesAreVolatile); /// \brief Create a ASTUnit from an AST file. /// @@ -770,7 +770,7 @@ public: /// created ASTUnit was passed in \p Unit then the caller can check that. /// static ASTUnit *LoadFromCompilerInvocationAction( - CompilerInvocation *CI, + std::shared_ptr CI, std::shared_ptr PCHContainerOps, IntrusiveRefCntPtr Diags, FrontendAction *Action = nullptr, ASTUnit *Unit = nullptr, @@ -797,7 +797,7 @@ public: // FIXME: Move OnlyLocalDecls, UseBumpAllocator to setters on the ASTUnit, we // shouldn't need to specify them at construction time. 
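The ASTUnit and CompilerInstance hunks in this stretch all chase one ownership change: CompilerInvocation stops being intrusively refcounted, factories hand back smart pointers, and call sites move them in (as in the main.cpp hunk at the top of the patch). A minimal sketch of the new calling convention, with a toy type standing in for CompilerInvocation:

    #include <memory>
    #include <utility>

    // Toy stand-ins; the real types are clang::CompilerInvocation and
    // clang::CompilerInstance. Only the ownership direction mirrors the
    // patch: setInvocation now takes std::shared_ptr by value, so callers
    // std::move() what used to be a raw release()'d pointer.
    struct InvocationSketch {};

    struct InstanceSketch {
      std::shared_ptr<InvocationSketch> Invocation;
      void setInvocation(std::shared_ptr<InvocationSketch> Value) {
        Invocation = std::move(Value);
      }
    };

    int main() {
      auto CI = std::make_shared<InvocationSketch>();
      InstanceSketch Clang;
      Clang.setInvocation(std::move(CI)); // was: setInvocation(CI.release())
      return Clang.Invocation ? 0 : 1;
    }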
static std::unique_ptr LoadFromCompilerInvocation( - CompilerInvocation *CI, + std::shared_ptr CI, std::shared_ptr PCHContainerOps, IntrusiveRefCntPtr Diags, FileManager *FileMgr, bool OnlyLocalDecls = false, bool CaptureDiagnostics = false, diff --git a/include/clang/Frontend/CompilerInstance.h b/include/clang/Frontend/CompilerInstance.h index 3f754d999874..3ebbc61515c6 100644 --- a/include/clang/Frontend/CompilerInstance.h +++ b/include/clang/Frontend/CompilerInstance.h @@ -70,7 +70,7 @@ class TargetInfo; /// and a long form that takes explicit instances of any required objects. class CompilerInstance : public ModuleLoader { /// The options used in this compiler instance. - IntrusiveRefCntPtr Invocation; + std::shared_ptr Invocation; /// The diagnostics engine instance. IntrusiveRefCntPtr Diagnostics; @@ -91,7 +91,7 @@ class CompilerInstance : public ModuleLoader { IntrusiveRefCntPtr SourceMgr; /// The preprocessor. - IntrusiveRefCntPtr PP; + std::shared_ptr PP; /// The AST context. IntrusiveRefCntPtr Context; @@ -228,7 +228,7 @@ public: } /// setInvocation - Replace the current invocation. - void setInvocation(CompilerInvocation *Value); + void setInvocation(std::shared_ptr Value); /// \brief Indicates whether we should (re)build the global module index. bool shouldBuildGlobalModuleIndex() const; @@ -288,6 +288,9 @@ public: const HeaderSearchOptions &getHeaderSearchOpts() const { return Invocation->getHeaderSearchOpts(); } + std::shared_ptr getHeaderSearchOptsPtr() const { + return Invocation->getHeaderSearchOptsPtr(); + } LangOptions &getLangOpts() { return *Invocation->getLangOpts(); @@ -433,13 +436,14 @@ public: return *PP; } + std::shared_ptr getPreprocessorPtr() { return PP; } + void resetAndLeakPreprocessor() { - BuryPointer(PP.get()); - PP.resetWithoutRelease(); + BuryPointer(new std::shared_ptr(PP)); } /// Replace the current preprocessor. - void setPreprocessor(Preprocessor *Value); + void setPreprocessor(std::shared_ptr Value); /// } /// @name ASTContext @@ -653,7 +657,7 @@ public: StringRef Path, StringRef Sysroot, bool DisablePCHValidation, bool AllowPCHWithCompilerErrors, Preprocessor &PP, ASTContext &Context, const PCHContainerReader &PCHContainerRdr, - ArrayRef> Extensions, + ArrayRef> Extensions, void *DeserializationListener, bool OwnDeserializationListener, bool Preamble, bool UseGlobalModuleIndex); diff --git a/include/clang/Frontend/CompilerInvocation.h b/include/clang/Frontend/CompilerInvocation.h index cb037c26546f..cef7f73ecaa0 100644 --- a/include/clang/Frontend/CompilerInvocation.h +++ b/include/clang/Frontend/CompilerInvocation.h @@ -51,7 +51,7 @@ bool ParseDiagnosticArgs(DiagnosticOptions &Opts, llvm::opt::ArgList &Args, bool DefaultDiagColor = true, bool DefaultShowOpt = true); -class CompilerInvocationBase : public RefCountedBase { +class CompilerInvocationBase { void operator=(const CompilerInvocationBase &) = delete; public: @@ -65,10 +65,10 @@ public: IntrusiveRefCntPtr DiagnosticOpts; /// Options controlling the \#include directive. - IntrusiveRefCntPtr HeaderSearchOpts; + std::shared_ptr HeaderSearchOpts; /// Options controlling the preprocessor (aside from \#include handling). 
- IntrusiveRefCntPtr PreprocessorOpts; + std::shared_ptr PreprocessorOpts; CompilerInvocationBase(); ~CompilerInvocationBase(); @@ -89,7 +89,13 @@ public: const HeaderSearchOptions &getHeaderSearchOpts() const { return *HeaderSearchOpts; } + std::shared_ptr getHeaderSearchOptsPtr() const { + return HeaderSearchOpts; + } + std::shared_ptr getPreprocessorOptsPtr() { + return PreprocessorOpts; + } PreprocessorOptions &getPreprocessorOpts() { return *PreprocessorOpts; } const PreprocessorOptions &getPreprocessorOpts() const { return *PreprocessorOpts; diff --git a/include/clang/Frontend/FrontendOptions.h b/include/clang/Frontend/FrontendOptions.h index aad397526a03..9c960bb0c305 100644 --- a/include/clang/Frontend/FrontendOptions.h +++ b/include/clang/Frontend/FrontendOptions.h @@ -243,7 +243,7 @@ public: std::vector Plugins; /// The list of module file extensions. - std::vector> ModuleFileExtensions; + std::vector> ModuleFileExtensions; /// \brief The list of module map files to load before processing the input. std::vector ModuleMapFiles; diff --git a/include/clang/Frontend/Utils.h b/include/clang/Frontend/Utils.h index 60419ff9b41d..0ee46846c804 100644 --- a/include/clang/Frontend/Utils.h +++ b/include/clang/Frontend/Utils.h @@ -184,10 +184,10 @@ createChainedIncludesSource(CompilerInstance &CI, /// /// \return A CompilerInvocation, or 0 if none was built for the given /// argument vector. -CompilerInvocation * +std::unique_ptr createInvocationFromCommandLine(ArrayRef Args, - IntrusiveRefCntPtr Diags = - IntrusiveRefCntPtr()); + IntrusiveRefCntPtr Diags = + IntrusiveRefCntPtr()); /// Return the value of the last argument as an integer, or a default. If Diags /// is non-null, emits an error if the argument is given, but non-integral. diff --git a/include/clang/Lex/HeaderSearch.h b/include/clang/Lex/HeaderSearch.h index b145d7bae15a..4df3e783117a 100644 --- a/include/clang/Lex/HeaderSearch.h +++ b/include/clang/Lex/HeaderSearch.h @@ -147,7 +147,7 @@ class HeaderSearch { }; /// \brief Header-search options used to initialize this header search. - IntrusiveRefCntPtr HSOpts; + std::shared_ptr HSOpts; DiagnosticsEngine &Diags; FileManager &FileMgr; @@ -248,7 +248,7 @@ class HeaderSearch { friend class DirectoryLookup; public: - HeaderSearch(IntrusiveRefCntPtr HSOpts, + HeaderSearch(std::shared_ptr HSOpts, SourceManager &SourceMgr, DiagnosticsEngine &Diags, const LangOptions &LangOpts, const TargetInfo *Target); ~HeaderSearch(); diff --git a/include/clang/Lex/HeaderSearchOptions.h b/include/clang/Lex/HeaderSearchOptions.h index 815b68c60e80..e99980537348 100644 --- a/include/clang/Lex/HeaderSearchOptions.h +++ b/include/clang/Lex/HeaderSearchOptions.h @@ -44,7 +44,7 @@ namespace frontend { /// HeaderSearchOptions - Helper class for storing options related to the /// initialization of the HeaderSearch object. -class HeaderSearchOptions : public RefCountedBase { +class HeaderSearchOptions { public: struct Entry { std::string Path; diff --git a/include/clang/Lex/Preprocessor.h b/include/clang/Lex/Preprocessor.h index bb71f49290b4..7ce1aad36d12 100644 --- a/include/clang/Lex/Preprocessor.h +++ b/include/clang/Lex/Preprocessor.h @@ -94,8 +94,8 @@ enum MacroUse { /// Lexers know only about tokens within a single source file, and don't /// know anything about preprocessor-level issues like the \#include stack, /// token expansion, etc. 
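The next hunk repeats a shape that recurs throughout these headers: a class stops deriving from llvm::RefCountedBase, and every holder is re-typed from IntrusiveRefCntPtr to std::shared_ptr in the same commit. The two differ in where the count lives (inside the object versus in a separate control block), which is why the holders cannot be migrated piecemeal. A small sketch of the after-state, with an invented stand-in type:

    #include <memory>

    // After this patch, classes like PreprocessorOptions carry no intrusive
    // refcount; ownership is expressed entirely at the holder, so the count
    // lives in shared_ptr's control block rather than inside the object.
    struct OptionsSketch { bool UsePredefines = true; }; // plain class now

    int main() {
      auto Opts = std::make_shared<OptionsSketch>();
      std::shared_ptr<OptionsSketch> Alias = Opts; // shared, non-intrusive
      return Alias.use_count() == 2 ? 0 : 1;
    }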
-class Preprocessor : public RefCountedBase { - IntrusiveRefCntPtr PPOpts; +class Preprocessor { + std::shared_ptr PPOpts; DiagnosticsEngine *Diags; LangOptions &LangOpts; const TargetInfo *Target; @@ -650,10 +650,9 @@ class Preprocessor : public RefCountedBase { void updateOutOfDateIdentifier(IdentifierInfo &II) const; public: - Preprocessor(IntrusiveRefCntPtr PPOpts, - DiagnosticsEngine &diags, LangOptions &opts, - SourceManager &SM, HeaderSearch &Headers, - ModuleLoader &TheModuleLoader, + Preprocessor(std::shared_ptr PPOpts, + DiagnosticsEngine &diags, LangOptions &opts, SourceManager &SM, + HeaderSearch &Headers, ModuleLoader &TheModuleLoader, IdentifierInfoLookup *IILookup = nullptr, bool OwnsHeaderSearch = false, TranslationUnitKind TUKind = TU_Complete); diff --git a/include/clang/Lex/PreprocessorOptions.h b/include/clang/Lex/PreprocessorOptions.h index de652cccb83a..58d79f7ff81a 100644 --- a/include/clang/Lex/PreprocessorOptions.h +++ b/include/clang/Lex/PreprocessorOptions.h @@ -40,7 +40,7 @@ enum ObjCXXARCStandardLibraryKind { /// PreprocessorOptions - This class is used for passing the various options /// used in preprocessor initialization to InitializePreprocessor(). -class PreprocessorOptions : public RefCountedBase { +class PreprocessorOptions { public: std::vector > Macros; std::vector Includes; @@ -117,7 +117,7 @@ public: ObjCXXARCStandardLibraryKind ObjCXXARCStandardLibrary; /// \brief Records the set of modules - class FailedModulesSet : public RefCountedBase { + class FailedModulesSet { llvm::StringSet<> Failed; public: @@ -136,7 +136,7 @@ public: /// to (re)build modules, so that once a module fails to build anywhere, /// other instances will see that the module has failed and won't try to /// build it again. - IntrusiveRefCntPtr FailedModules; + std::shared_ptr FailedModules; public: PreprocessorOptions() : UsePredefines(true), DetailedRecord(false), diff --git a/include/clang/Sema/CodeCompleteConsumer.h b/include/clang/Sema/CodeCompleteConsumer.h index b80924ea11fc..dee53dc14a8c 100644 --- a/include/clang/Sema/CodeCompleteConsumer.h +++ b/include/clang/Sema/CodeCompleteConsumer.h @@ -509,23 +509,18 @@ public: }; /// \brief Allocator for a cached set of global code completions. -class GlobalCodeCompletionAllocator - : public CodeCompletionAllocator, - public RefCountedBase -{ - -}; +class GlobalCodeCompletionAllocator : public CodeCompletionAllocator {}; class CodeCompletionTUInfo { llvm::DenseMap ParentNames; - IntrusiveRefCntPtr AllocatorRef; + std::shared_ptr AllocatorRef; public: explicit CodeCompletionTUInfo( - IntrusiveRefCntPtr Allocator) + std::shared_ptr Allocator) : AllocatorRef(std::move(Allocator)) {} - IntrusiveRefCntPtr getAllocatorRef() const { + std::shared_ptr getAllocatorRef() const { return AllocatorRef; } CodeCompletionAllocator &getAllocator() const { @@ -965,8 +960,8 @@ public: /// results to the given raw output stream. PrintingCodeCompleteConsumer(const CodeCompleteOptions &CodeCompleteOpts, raw_ostream &OS) - : CodeCompleteConsumer(CodeCompleteOpts, false), OS(OS), - CCTUInfo(new GlobalCodeCompletionAllocator) {} + : CodeCompleteConsumer(CodeCompleteOpts, false), OS(OS), + CCTUInfo(std::make_shared()) {} /// \brief Prints the finalized code-completion results. 
void ProcessCodeCompleteResults(Sema &S, CodeCompletionContext Context, diff --git a/include/clang/Sema/Ownership.h b/include/clang/Sema/Ownership.h index 92ea5296c45b..fd46de870fb4 100644 --- a/include/clang/Sema/Ownership.h +++ b/include/clang/Sema/Ownership.h @@ -153,8 +153,8 @@ namespace clang { ActionResult(const DiagnosticBuilder &) : Val(PtrTy()), Invalid(true) {} // These two overloads prevent void* -> bool conversions. - ActionResult(const void *); - ActionResult(volatile void *); + ActionResult(const void *) = delete; + ActionResult(volatile void *) = delete; bool isInvalid() const { return Invalid; } bool isUsable() const { return !Invalid && Val; } @@ -192,8 +192,8 @@ namespace clang { ActionResult(const DiagnosticBuilder &) : PtrWithInvalid(0x01) { } // These two overloads prevent void* -> bool conversions. - ActionResult(const void *); - ActionResult(volatile void *); + ActionResult(const void *) = delete; + ActionResult(volatile void *) = delete; bool isInvalid() const { return PtrWithInvalid & 0x01; } bool isUsable() const { return PtrWithInvalid > 0x01; } diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h index 3762253ef113..ca984a360a60 100644 --- a/include/clang/Sema/Sema.h +++ b/include/clang/Sema/Sema.h @@ -6564,6 +6564,10 @@ public: /// \brief After substituting deduced template arguments, a dependent /// parameter type did not match the corresponding argument. TDK_DeducedMismatch, + /// \brief After substituting deduced template arguments, an element of + /// a dependent parameter type did not match the corresponding element + /// of the corresponding argument (when deducing from an initializer list). + TDK_DeducedMismatchNested, /// \brief A non-depnedent component of the parameter did not match the /// corresponding component of the argument. TDK_NonDeducedMismatch, @@ -6602,13 +6606,14 @@ public: /// brief A function argument from which we performed template argument // deduction for a call. struct OriginalCallArg { - OriginalCallArg(QualType OriginalParamType, - unsigned ArgIdx, - QualType OriginalArgType) - : OriginalParamType(OriginalParamType), ArgIdx(ArgIdx), - OriginalArgType(OriginalArgType) { } + OriginalCallArg(QualType OriginalParamType, bool DecomposedParam, + unsigned ArgIdx, QualType OriginalArgType) + : OriginalParamType(OriginalParamType), + DecomposedParam(DecomposedParam), ArgIdx(ArgIdx), + OriginalArgType(OriginalArgType) {} QualType OriginalParamType; + bool DecomposedParam; unsigned ArgIdx; QualType OriginalArgType; }; diff --git a/include/clang/Serialization/ASTReader.h b/include/clang/Serialization/ASTReader.h index 5230e2ae0013..93994e2c519c 100644 --- a/include/clang/Serialization/ASTReader.h +++ b/include/clang/Serialization/ASTReader.h @@ -384,8 +384,8 @@ private: std::unique_ptr Listener; /// \brief The receiver of deserialization events. - ASTDeserializationListener *DeserializationListener; - bool OwnsDeserializationListener; + ASTDeserializationListener *DeserializationListener = nullptr; + bool OwnsDeserializationListener = false; SourceManager &SourceMgr; FileManager &FileMgr; @@ -394,7 +394,7 @@ private: /// \brief The semantic analysis object that will be processing the /// AST files and the translation unit that uses it. - Sema *SemaObj; + Sema *SemaObj = nullptr; /// \brief The preprocessor that will be loading the source file. Preprocessor &PP; @@ -403,7 +403,7 @@ private: ASTContext &Context; /// \brief The AST consumer. 
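One small modernization buried in the Ownership.h hunk above deserves a note: the two never-defined ActionResult overloads become explicitly deleted, turning an accidental void*-to-bool conversion from a link-time error into a compile-time one. The idiom in isolation, on a toy type:

    // Sketch of the Ownership.h idiom: deleting the void* constructors
    // blocks the implicit const void* -> bool -> ActionResult route at
    // compile time (the old trick merely declared them privately and never
    // defined them, failing only at link time).
    struct ResultSketch {
      ResultSketch(bool Invalid) : Invalid(Invalid) {}
      ResultSketch(const void *) = delete;
      ResultSketch(volatile void *) = delete;
      bool Invalid;
    };

    int main() {
      ResultSketch OK(false);
      // const void *P = nullptr; ResultSketch Bad(P); // error: deleted
      return OK.Invalid ? 1 : 0;
    }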
- ASTConsumer *Consumer; + ASTConsumer *Consumer = nullptr; /// \brief The module manager which manages modules and their dependencies ModuleManager ModuleMgr; @@ -414,7 +414,7 @@ private: IdentifierResolver DummyIdResolver; /// A mapping from extension block names to module file extensions. - llvm::StringMap> ModuleFileExtensions; + llvm::StringMap> ModuleFileExtensions; /// \brief A timer used to track the time spent deserializing. std::unique_ptr ReadTimer; @@ -802,10 +802,10 @@ private: SourceLocation OptimizeOffPragmaLocation; /// \brief The PragmaMSStructKind pragma ms_struct state if set, or -1. - int PragmaMSStructState; + int PragmaMSStructState = -1; /// \brief The PragmaMSPointersToMembersKind pragma pointers_to_members state. - int PragmaMSPointersToMembersState; + int PragmaMSPointersToMembersState = -1; SourceLocation PointersToMembersPragmaLocation; /// \brief The OpenCL extension settings. @@ -870,10 +870,10 @@ private: bool UseGlobalIndex; /// \brief Whether we have tried loading the global module index yet. - bool TriedLoadingGlobalIndex; + bool TriedLoadingGlobalIndex = false; ///\brief Whether we are currently processing update records. - bool ProcessingUpdateRecords; + bool ProcessingUpdateRecords = false; typedef llvm::DenseMap SwitchCaseMapTy; /// \brief Mapping from switch-case IDs in the chain to switch-case statements @@ -886,73 +886,73 @@ private: /// \brief The number of source location entries de-serialized from /// the PCH file. - unsigned NumSLocEntriesRead; + unsigned NumSLocEntriesRead = 0; /// \brief The number of source location entries in the chain. - unsigned TotalNumSLocEntries; + unsigned TotalNumSLocEntries = 0; /// \brief The number of statements (and expressions) de-serialized /// from the chain. - unsigned NumStatementsRead; + unsigned NumStatementsRead = 0; /// \brief The total number of statements (and expressions) stored /// in the chain. - unsigned TotalNumStatements; + unsigned TotalNumStatements = 0; /// \brief The number of macros de-serialized from the chain. - unsigned NumMacrosRead; + unsigned NumMacrosRead = 0; /// \brief The total number of macros stored in the chain. - unsigned TotalNumMacros; + unsigned TotalNumMacros = 0; /// \brief The number of lookups into identifier tables. - unsigned NumIdentifierLookups; + unsigned NumIdentifierLookups = 0; /// \brief The number of lookups into identifier tables that succeed. - unsigned NumIdentifierLookupHits; + unsigned NumIdentifierLookupHits = 0; /// \brief The number of selectors that have been read. - unsigned NumSelectorsRead; + unsigned NumSelectorsRead = 0; /// \brief The number of method pool entries that have been read. - unsigned NumMethodPoolEntriesRead; + unsigned NumMethodPoolEntriesRead = 0; /// \brief The number of times we have looked up a selector in the method /// pool. - unsigned NumMethodPoolLookups; + unsigned NumMethodPoolLookups = 0; /// \brief The number of times we have looked up a selector in the method /// pool and found something. - unsigned NumMethodPoolHits; + unsigned NumMethodPoolHits = 0; /// \brief The number of times we have looked up a selector in the method /// pool within a specific module. - unsigned NumMethodPoolTableLookups; + unsigned NumMethodPoolTableLookups = 0; /// \brief The number of times we have looked up a selector in the method /// pool within a specific module and found something. - unsigned NumMethodPoolTableHits; + unsigned NumMethodPoolTableHits = 0; /// \brief The total number of method pool entries in the selector table. 
- unsigned TotalNumMethodPoolEntries; + unsigned TotalNumMethodPoolEntries = 0; /// Number of lexical decl contexts read/total. - unsigned NumLexicalDeclContextsRead, TotalLexicalDeclContexts; + unsigned NumLexicalDeclContextsRead = 0, TotalLexicalDeclContexts = 0; /// Number of visible decl contexts read/total. - unsigned NumVisibleDeclContextsRead, TotalVisibleDeclContexts; + unsigned NumVisibleDeclContextsRead = 0, TotalVisibleDeclContexts = 0; /// Total size of modules, in bits, currently loaded - uint64_t TotalModulesSizeInBits; + uint64_t TotalModulesSizeInBits = 0; /// \brief Number of Decl/types that are currently deserializing. - unsigned NumCurrentElementsDeserializing; + unsigned NumCurrentElementsDeserializing = 0; /// \brief Set true while we are in the process of passing deserialized /// "interesting" decls to consumer inside FinishedDeserializing(). /// This is used as a guard to avoid recursively repeating the process of /// passing decls to consumer. - bool PassingDeclsToConsumer; + bool PassingDeclsToConsumer = false; /// \brief The set of identifiers that were read while the AST reader was /// (recursively) loading declarations. @@ -1055,7 +1055,7 @@ private: }; /// \brief What kind of records we are reading. - ReadingKind ReadingKind; + ReadingKind ReadingKind = Read_None; /// \brief RAII object to change the reading kind. class ReadingKindTracker { @@ -1366,7 +1366,7 @@ public: /// deserializing. ASTReader(Preprocessor &PP, ASTContext &Context, const PCHContainerReader &PCHContainerRdr, - ArrayRef> Extensions, + ArrayRef> Extensions, StringRef isysroot = "", bool DisableValidation = false, bool AllowASTWithCompilerErrors = false, bool AllowConfigurationMismatch = false, diff --git a/include/clang/Serialization/ASTWriter.h b/include/clang/Serialization/ASTWriter.h index 1469555ec21e..0d6b0268109d 100644 --- a/include/clang/Serialization/ASTWriter.h +++ b/include/clang/Serialization/ASTWriter.h @@ -107,16 +107,16 @@ private: llvm::BitstreamWriter &Stream; /// \brief The ASTContext we're writing. - ASTContext *Context; + ASTContext *Context = nullptr; /// \brief The preprocessor we're writing. - Preprocessor *PP; + Preprocessor *PP = nullptr; /// \brief The reader of existing AST files, if we're chaining. - ASTReader *Chain; + ASTReader *Chain = nullptr; /// \brief The module we're currently writing, if any. - Module *WritingModule; + Module *WritingModule = nullptr; /// \brief The base directory for any relative paths we emit. std::string BaseDirectory; @@ -129,14 +129,14 @@ private: /// \brief Indicates when the AST writing is actively performing /// serialization, rather than just queueing updates. - bool WritingAST; + bool WritingAST = false; /// \brief Indicates that we are done serializing the collection of decls /// and types to emit. - bool DoneWritingDeclsAndTypes; + bool DoneWritingDeclsAndTypes = false; /// \brief Indicates that the AST contained compiler errors. - bool ASTHasCompilerErrors; + bool ASTHasCompilerErrors = false; /// \brief Mapping from input file entries to the index into the /// offset table where information about that input file is stored. @@ -170,10 +170,10 @@ private: std::queue DeclTypesToEmit; /// \brief The first ID number we can use for our own declarations. - serialization::DeclID FirstDeclID; + serialization::DeclID FirstDeclID = serialization::NUM_PREDEF_DECL_IDS; /// \brief The decl ID that will be assigned to the next new decl. 
- serialization::DeclID NextDeclID; + serialization::DeclID NextDeclID = FirstDeclID; /// \brief Map that provides the ID numbers of each declaration within /// the output stream, as well as those deserialized from a chained PCH. @@ -205,10 +205,10 @@ private: void associateDeclWithFile(const Decl *D, serialization::DeclID); /// \brief The first ID number we can use for our own types. - serialization::TypeID FirstTypeID; + serialization::TypeID FirstTypeID = serialization::NUM_PREDEF_TYPE_IDS; /// \brief The type ID that will be assigned to the next new type. - serialization::TypeID NextTypeID; + serialization::TypeID NextTypeID = FirstTypeID; /// \brief Map that provides the ID numbers of each type within the /// output stream, plus those deserialized from a chained PCH. @@ -226,10 +226,10 @@ private: std::vector TypeOffsets; /// \brief The first ID number we can use for our own identifiers. - serialization::IdentID FirstIdentID; + serialization::IdentID FirstIdentID = serialization::NUM_PREDEF_IDENT_IDS; /// \brief The identifier ID that will be assigned to the next new identifier. - serialization::IdentID NextIdentID; + serialization::IdentID NextIdentID = FirstIdentID; /// \brief Map that provides the ID numbers of each identifier in /// the output stream. @@ -240,10 +240,10 @@ private: llvm::MapVector IdentifierIDs; /// \brief The first ID number we can use for our own macros. - serialization::MacroID FirstMacroID; + serialization::MacroID FirstMacroID = serialization::NUM_PREDEF_MACRO_IDS; /// \brief The identifier ID that will be assigned to the next new identifier. - serialization::MacroID NextMacroID; + serialization::MacroID NextMacroID = FirstMacroID; /// \brief Map that provides the ID numbers of each macro. llvm::DenseMap MacroIDs; @@ -275,16 +275,18 @@ private: std::vector IdentifierOffsets; /// \brief The first ID number we can use for our own submodules. - serialization::SubmoduleID FirstSubmoduleID; - + serialization::SubmoduleID FirstSubmoduleID = + serialization::NUM_PREDEF_SUBMODULE_IDS; + /// \brief The submodule ID that will be assigned to the next new submodule. - serialization::SubmoduleID NextSubmoduleID; + serialization::SubmoduleID NextSubmoduleID = FirstSubmoduleID; /// \brief The first ID number we can use for our own selectors. - serialization::SelectorID FirstSelectorID; + serialization::SelectorID FirstSelectorID = + serialization::NUM_PREDEF_SELECTOR_IDS; /// \brief The selector ID that will be assigned to the next new selector. - serialization::SelectorID NextSelectorID; + serialization::SelectorID NextSelectorID = FirstSelectorID; /// \brief Map that provides the ID numbers of each Selector. llvm::MapVector SelectorIDs; @@ -394,18 +396,18 @@ private: llvm::DenseMap SwitchCaseIDs; /// \brief The number of statements written to the AST file. - unsigned NumStatements; + unsigned NumStatements = 0; /// \brief The number of macros written to the AST file. - unsigned NumMacros; + unsigned NumMacros = 0; /// \brief The number of lexical declcontexts written to the AST /// file. - unsigned NumLexicalDeclContexts; + unsigned NumLexicalDeclContexts = 0; /// \brief The number of visible declcontexts written to the AST /// file. - unsigned NumVisibleDeclContexts; + unsigned NumVisibleDeclContexts = 0; /// \brief A mapping from each known submodule to its ID number, which will /// be a positive integer. 
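The long runs of = 0 / = nullptr / = false edits running through ASTReader.h and ASTWriter.h are one idiom applied member by member: C++11 in-class default member initializers replacing constructor-initializer-list boilerplate, so a newly added counter can never be forgotten in one of several constructors. The pattern in miniature, with an invented type:

    // Miniature of the ASTReader/ASTWriter cleanup: defaults move onto the
    // declarations, so every constructor (including ones added later)
    // starts from the same zeroed state without repeating an init list.
    struct StatsSketch {
      unsigned NumStatementsRead = 0;
      unsigned TotalNumStatements = 0;
      bool PassingDeclsToConsumer = false;
      StatsSketch() = default; // nothing to list; the NSDMIs apply
    };

    int main() {
      StatsSketch S;
      return (S.NumStatementsRead == 0 && !S.PassingDeclsToConsumer) ? 0 : 1;
    }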
@@ -436,8 +438,8 @@ private: void WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag, bool isModule); - unsigned TypeExtQualAbbrev; - unsigned TypeFunctionProtoAbbrev; + unsigned TypeExtQualAbbrev = 0; + unsigned TypeFunctionProtoAbbrev = 0; void WriteTypeAbbrevs(); void WriteType(QualType T); @@ -470,22 +472,22 @@ private: void WriteModuleFileExtension(Sema &SemaRef, ModuleFileExtensionWriter &Writer); - unsigned DeclParmVarAbbrev; - unsigned DeclContextLexicalAbbrev; - unsigned DeclContextVisibleLookupAbbrev; - unsigned UpdateVisibleAbbrev; - unsigned DeclRecordAbbrev; - unsigned DeclTypedefAbbrev; - unsigned DeclVarAbbrev; - unsigned DeclFieldAbbrev; - unsigned DeclEnumAbbrev; - unsigned DeclObjCIvarAbbrev; - unsigned DeclCXXMethodAbbrev; - - unsigned DeclRefExprAbbrev; - unsigned CharacterLiteralAbbrev; - unsigned IntegerLiteralAbbrev; - unsigned ExprImplicitCastAbbrev; + unsigned DeclParmVarAbbrev = 0; + unsigned DeclContextLexicalAbbrev = 0; + unsigned DeclContextVisibleLookupAbbrev = 0; + unsigned UpdateVisibleAbbrev = 0; + unsigned DeclRecordAbbrev = 0; + unsigned DeclTypedefAbbrev = 0; + unsigned DeclVarAbbrev = 0; + unsigned DeclFieldAbbrev = 0; + unsigned DeclEnumAbbrev = 0; + unsigned DeclObjCIvarAbbrev = 0; + unsigned DeclCXXMethodAbbrev = 0; + + unsigned DeclRefExprAbbrev = 0; + unsigned CharacterLiteralAbbrev = 0; + unsigned IntegerLiteralAbbrev = 0; + unsigned ExprImplicitCastAbbrev = 0; void WriteDeclAbbrevs(); void WriteDecl(ASTContext &Context, Decl *D); @@ -498,7 +500,7 @@ public: /// \brief Create a new precompiled header writer that outputs to /// the given bitstream. ASTWriter(llvm::BitstreamWriter &Stream, - ArrayRef> Extensions, + ArrayRef> Extensions, bool IncludeTimestamps = true); ~ASTWriter() override; @@ -934,13 +936,10 @@ protected: SmallVectorImpl &getPCH() const { return Buffer->Data; } public: - PCHGenerator( - const Preprocessor &PP, StringRef OutputFile, - StringRef isysroot, - std::shared_ptr Buffer, - ArrayRef> Extensions, - bool AllowASTWithErrors = false, - bool IncludeTimestamps = true); + PCHGenerator(const Preprocessor &PP, StringRef OutputFile, StringRef isysroot, + std::shared_ptr Buffer, + ArrayRef> Extensions, + bool AllowASTWithErrors = false, bool IncludeTimestamps = true); ~PCHGenerator() override; void InitializeSema(Sema &S) override { SemaPtr = &S; } void HandleTranslationUnit(ASTContext &Ctx) override; diff --git a/include/clang/Serialization/ModuleFileExtension.h b/include/clang/Serialization/ModuleFileExtension.h index ba2e2fd0d9f1..f7bdcec598f1 100644 --- a/include/clang/Serialization/ModuleFileExtension.h +++ b/include/clang/Serialization/ModuleFileExtension.h @@ -60,7 +60,7 @@ class ModuleFileExtensionWriter; /// compiled module files (.pcm) and precompiled headers (.pch) via a /// custom writer that can then be accessed via a custom reader when /// the module file or precompiled header is loaded. 
-class ModuleFileExtension : public llvm::RefCountedBase { +class ModuleFileExtension { public: virtual ~ModuleFileExtension(); diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h index 73f4dd5a3e91..0f1eb096c495 100644 --- a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h +++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporter.h @@ -66,8 +66,7 @@ public: typedef SmallVector, 8> VisitorList; typedef VisitorList::iterator visitor_iterator; typedef SmallVector ExtraTextList; - typedef SmallVector, 4> - NoteList; + typedef SmallVector, 4> NoteList; protected: friend class BugReporter; @@ -268,12 +267,12 @@ public: /// the extra note should appear. void addNote(StringRef Msg, const PathDiagnosticLocation &Pos, ArrayRef Ranges) { - PathDiagnosticNotePiece *P = new PathDiagnosticNotePiece(Pos, Msg); + auto P = std::make_shared(Pos, Msg); for (const auto &R : Ranges) P->addRange(R); - Notes.push_back(P); + Notes.push_back(std::move(P)); } // FIXME: Instead of making an override, we could have default-initialized diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h index 8c3a1d0d4b40..b72bce5fc9f8 100644 --- a/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h +++ b/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitor.h @@ -59,10 +59,9 @@ public: /// /// The last parameter can be used to register a new visitor with the given /// BugReport while processing a node. - virtual PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) = 0; + virtual std::shared_ptr + VisitNode(const ExplodedNode *Succ, const ExplodedNode *Pred, + BugReporterContext &BRC, BugReport &BR) = 0; /// \brief Provide custom definition for the final diagnostic piece on the /// path - the piece, which is displayed before the path is expanded. @@ -121,10 +120,10 @@ public: void Profile(llvm::FoldingSetNodeID &ID) const override; - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; }; class TrackConstraintBRVisitor final @@ -150,10 +149,10 @@ public: /// to make all PathDiagnosticPieces created by this visitor. static const char *getTag(); - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; private: /// Checks if the constraint is valid in the current state. @@ -172,10 +171,10 @@ public: ID.AddPointer(&x); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; /// If the statement is a message send expression with nil receiver, returns /// the receiver expression. Returns NULL otherwise. @@ -200,49 +199,38 @@ public: /// to make all PathDiagnosticPieces created by this visitor. 
static const char *getTag(); - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *Prev, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *Prev, + BugReporterContext &BRC, + BugReport &BR) override; - PathDiagnosticPiece *VisitNodeImpl(const ExplodedNode *N, - const ExplodedNode *Prev, - BugReporterContext &BRC, - BugReport &BR); - - PathDiagnosticPiece *VisitTerminator(const Stmt *Term, - const ExplodedNode *N, - const CFGBlock *srcBlk, - const CFGBlock *dstBlk, - BugReport &R, - BugReporterContext &BRC); - - PathDiagnosticPiece *VisitTrueTest(const Expr *Cond, - bool tookTrue, - BugReporterContext &BRC, - BugReport &R, - const ExplodedNode *N); - - PathDiagnosticPiece *VisitTrueTest(const Expr *Cond, - const DeclRefExpr *DR, - const bool tookTrue, - BugReporterContext &BRC, - BugReport &R, - const ExplodedNode *N); - - PathDiagnosticPiece *VisitTrueTest(const Expr *Cond, - const BinaryOperator *BExpr, - const bool tookTrue, - BugReporterContext &BRC, - BugReport &R, - const ExplodedNode *N); - - PathDiagnosticPiece *VisitConditionVariable(StringRef LhsString, - const Expr *CondVarExpr, - const bool tookTrue, - BugReporterContext &BRC, - BugReport &R, - const ExplodedNode *N); + std::shared_ptr VisitNodeImpl(const ExplodedNode *N, + const ExplodedNode *Prev, + BugReporterContext &BRC, + BugReport &BR); + + std::shared_ptr + VisitTerminator(const Stmt *Term, const ExplodedNode *N, + const CFGBlock *srcBlk, const CFGBlock *dstBlk, BugReport &R, + BugReporterContext &BRC); + + std::shared_ptr + VisitTrueTest(const Expr *Cond, bool tookTrue, BugReporterContext &BRC, + BugReport &R, const ExplodedNode *N); + + std::shared_ptr + VisitTrueTest(const Expr *Cond, const DeclRefExpr *DR, const bool tookTrue, + BugReporterContext &BRC, BugReport &R, const ExplodedNode *N); + + std::shared_ptr + VisitTrueTest(const Expr *Cond, const BinaryOperator *BExpr, + const bool tookTrue, BugReporterContext &BRC, BugReport &R, + const ExplodedNode *N); + + std::shared_ptr + VisitConditionVariable(StringRef LhsString, const Expr *CondVarExpr, + const bool tookTrue, BugReporterContext &BRC, + BugReport &R, const ExplodedNode *N); bool patternMatch(const Expr *Ex, const Expr *ParentEx, @@ -270,10 +258,10 @@ public: ID.AddPointer(getTag()); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *Prev, - BugReporterContext &BRC, - BugReport &BR) override { + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *Prev, + BugReporterContext &BRC, + BugReport &BR) override { return nullptr; } @@ -302,10 +290,10 @@ public: ID.AddPointer(R); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; }; class SuppressInlineDefensiveChecksVisitor final @@ -333,10 +321,10 @@ public: /// to make all PathDiagnosticPieces created by this visitor. 
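The many VisitNode re-declarations above are one signature change fanned out across every bug-reporter visitor: the raw, implicitly owning PathDiagnosticPiece* result becomes std::shared_ptr, keeping the "return nullptr when there is nothing to say" convention. A toy of the new shape (names invented; only the return convention mirrors the patch):

    #include <memory>

    // Toy stand-ins for PathDiagnosticPiece and a visitor; shared ownership
    // of the produced piece, nullptr when the node yields no diagnostic.
    struct PieceSketch { const char *Msg; };

    struct VisitorSketch {
      std::shared_ptr<PieceSketch> VisitNode(bool Interesting) {
        if (!Interesting)
          return nullptr;
        return std::make_shared<PieceSketch>(PieceSketch{"note"});
      }
    };

    int main() {
      VisitorSketch V;
      bool OK = V.VisitNode(true) && !V.VisitNode(false);
      return OK ? 0 : 1;
    }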
static const char *getTag(); - PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, + BugReport &BR) override; }; class CXXSelfAssignmentBRVisitor final @@ -349,10 +337,10 @@ public: void Profile(llvm::FoldingSetNodeID &ID) const override {} - PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, + BugReport &BR) override; }; namespace bugreporter { diff --git a/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h b/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h index efe809fb1981..dc6e54a33206 100644 --- a/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h +++ b/include/clang/StaticAnalyzer/Core/BugReporter/PathDiagnostic.h @@ -334,7 +334,7 @@ public: // Path "pieces" for path-sensitive diagnostics. //===----------------------------------------------------------------------===// -class PathDiagnosticPiece : public RefCountedBase { +class PathDiagnosticPiece { public: enum Kind { ControlFlow, Event, Macro, Call, Note }; enum DisplayHint { Above, Below }; @@ -416,9 +416,8 @@ public: virtual void dump() const = 0; }; - - -class PathPieces : public std::list > { + +class PathPieces : public std::list> { void flattenTo(PathPieces &Primary, PathPieces &Current, bool ShouldFlattenMacros) const; public: @@ -590,11 +589,11 @@ public: PathDiagnosticLocation getLocation() const override { return callEnter; } - - IntrusiveRefCntPtr getCallEnterEvent() const; - IntrusiveRefCntPtr - getCallEnterWithinCallerEvent() const; - IntrusiveRefCntPtr getCallExitEvent() const; + + std::shared_ptr getCallEnterEvent() const; + std::shared_ptr + getCallEnterWithinCallerEvent() const; + std::shared_ptr getCallExitEvent() const; void flattenLocations() override { callEnter.flatten(); @@ -602,11 +601,11 @@ public: for (PathPieces::iterator I = path.begin(), E = path.end(); I != E; ++I) (*I)->flattenLocations(); } - - static PathDiagnosticCallPiece *construct(const ExplodedNode *N, - const CallExitEnd &CE, - const SourceManager &SM); - + + static std::shared_ptr + construct(const ExplodedNode *N, const CallExitEnd &CE, + const SourceManager &SM); + static PathDiagnosticCallPiece *construct(PathPieces &pieces, const Decl *caller); @@ -787,7 +786,7 @@ public: assert(!Loc.isValid() && "End location already set!"); Loc = EndPiece->getLocation(); assert(Loc.isValid() && "Invalid location for end-of-path piece"); - getActivePath().push_back(EndPiece.release()); + getActivePath().push_back(std::move(EndPiece)); } void appendToDesc(StringRef S) { diff --git a/include/clang/StaticAnalyzer/Core/CheckerManager.h b/include/clang/StaticAnalyzer/Core/CheckerManager.h index 5af717d90268..0316c8fb173b 100644 --- a/include/clang/StaticAnalyzer/Core/CheckerManager.h +++ b/include/clang/StaticAnalyzer/Core/CheckerManager.h @@ -102,12 +102,12 @@ enum class ObjCMessageVisitKind { class CheckerManager { const LangOptions LangOpts; - AnalyzerOptionsRef AOptions; + AnalyzerOptions &AOptions; CheckName CurrentCheckName; public: - CheckerManager(const LangOptions &langOpts, AnalyzerOptionsRef AOptions) - : LangOpts(langOpts), AOptions(std::move(AOptions)) {} + CheckerManager(const LangOptions &langOpts, AnalyzerOptions 
&AOptions) + : LangOpts(langOpts), AOptions(AOptions) {} ~CheckerManager(); @@ -119,7 +119,7 @@ public: void finishedCheckerRegistration(); const LangOptions &getLangOpts() const { return LangOpts; } - AnalyzerOptions &getAnalyzerOptions() { return *AOptions; } + AnalyzerOptions &getAnalyzerOptions() { return AOptions; } typedef CheckerBase *CheckerRef; typedef const void *CheckerTag; diff --git a/include/clang/Tooling/Tooling.h b/include/clang/Tooling/Tooling.h index ca232f409831..10e26ac25d17 100644 --- a/include/clang/Tooling/Tooling.h +++ b/include/clang/Tooling/Tooling.h @@ -69,7 +69,8 @@ public: /// \brief Perform an action for an invocation. virtual bool - runInvocation(clang::CompilerInvocation *Invocation, FileManager *Files, + runInvocation(std::shared_ptr Invocation, + FileManager *Files, std::shared_ptr PCHContainerOps, DiagnosticConsumer *DiagConsumer) = 0; }; @@ -85,7 +86,8 @@ public: ~FrontendActionFactory() override; /// \brief Invokes the compiler with a FrontendAction created by create(). - bool runInvocation(clang::CompilerInvocation *Invocation, FileManager *Files, + bool runInvocation(std::shared_ptr Invocation, + FileManager *Files, std::shared_ptr PCHContainerOps, DiagnosticConsumer *DiagConsumer) override; @@ -261,7 +263,7 @@ public: bool runInvocation(const char *BinaryName, clang::driver::Compilation *Compilation, - clang::CompilerInvocation *Invocation, + std::shared_ptr Invocation, std::shared_ptr PCHContainerOps); std::vector CommandLine; diff --git a/lib/ARCMigrate/ARCMT.cpp b/lib/ARCMigrate/ARCMT.cpp index 680aa3e48da4..cf7cddefc03d 100644 --- a/lib/ARCMigrate/ARCMT.cpp +++ b/lib/ARCMigrate/ARCMT.cpp @@ -271,7 +271,7 @@ bool arcmt::checkForManualIssues( Diags->setClient(&errRec, /*ShouldOwnClient=*/false); std::unique_ptr Unit(ASTUnit::LoadFromCompilerInvocationAction( - CInvok.release(), PCHContainerOps, Diags)); + std::move(CInvok), PCHContainerOps, Diags)); if (!Unit) { errRec.FinishCapture(); return true; @@ -547,7 +547,7 @@ bool MigrationProcess::applyTransform(TransformFn trans, ASTAction.reset(new ARCMTMacroTrackerAction(ARCMTMacroLocs)); std::unique_ptr Unit(ASTUnit::LoadFromCompilerInvocationAction( - CInvok.release(), PCHContainerOps, Diags, ASTAction.get())); + std::move(CInvok), PCHContainerOps, Diags, ASTAction.get())); if (!Unit) { errRec.FinishCapture(); return true; diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp index 1b5988d01988..d03c22af5b29 100644 --- a/lib/AST/ASTContext.cpp +++ b/lib/AST/ASTContext.cpp @@ -1458,7 +1458,9 @@ CharUnits ASTContext::getDeclAlign(const Decl *D, bool ForAlignof) const { T = getPointerType(RT->getPointeeType()); } QualType BaseT = getBaseElementType(T); - if (!BaseT->isIncompleteType() && !T->isFunctionType()) { + if (T->isFunctionType()) + Align = getTypeInfoImpl(T.getTypePtr()).Align; + else if (!BaseT->isIncompleteType()) { // Adjust alignments of declarations with array type by the // large-array alignment on the target. 
if (const ArrayType *arrayType = getAsArrayType(T)) { diff --git a/lib/ASTMatchers/Dynamic/VariantValue.cpp b/lib/ASTMatchers/Dynamic/VariantValue.cpp index 8f3c70c1a8d8..f0339ed479cd 100644 --- a/lib/ASTMatchers/Dynamic/VariantValue.cpp +++ b/lib/ASTMatchers/Dynamic/VariantValue.cpp @@ -216,18 +216,20 @@ private: VariantMatcher::VariantMatcher() {} VariantMatcher VariantMatcher::SingleMatcher(const DynTypedMatcher &Matcher) { - return VariantMatcher(new SinglePayload(Matcher)); + return VariantMatcher(std::make_shared(Matcher)); } VariantMatcher VariantMatcher::PolymorphicMatcher(std::vector Matchers) { - return VariantMatcher(new PolymorphicPayload(std::move(Matchers))); + return VariantMatcher( + std::make_shared(std::move(Matchers))); } VariantMatcher VariantMatcher::VariadicOperatorMatcher( DynTypedMatcher::VariadicOperator Op, std::vector Args) { - return VariantMatcher(new VariadicOpPayload(Op, std::move(Args))); + return VariantMatcher( + std::make_shared(Op, std::move(Args))); } llvm::Optional VariantMatcher::getSingleMatcher() const { diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp index 85a83bca002b..4d2b3d007599 100644 --- a/lib/Basic/Targets.cpp +++ b/lib/Basic/Targets.cpp @@ -1751,30 +1751,57 @@ class NVPTXTargetInfo : public TargetInfo { static const char *const GCCRegNames[]; static const Builtin::Info BuiltinInfo[]; CudaArch GPU; + std::unique_ptr HostTarget; public: - NVPTXTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) + NVPTXTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts, + unsigned TargetPointerWidth) : TargetInfo(Triple) { + assert((TargetPointerWidth == 32 || TargetPointerWidth == 64) && + "NVPTX only supports 32- and 64-bit modes."); + TLSSupported = false; - LongWidth = LongAlign = 64; AddrSpaceMap = &NVPTXAddrSpaceMap; UseAddrSpaceMapMangling = true; + // Define available target features // These must be defined in sorted order! NoAsmVariants = true; GPU = CudaArch::SM_20; + if (TargetPointerWidth == 32) + resetDataLayout("e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"); + else + resetDataLayout("e-i64:64-v16:16-v32:32-n16:32:64"); + // If possible, get a TargetInfo for our host triple, so we can match its // types. llvm::Triple HostTriple(Opts.HostTriple); - if (HostTriple.isNVPTX()) - return; - std::unique_ptr HostTarget( - AllocateTarget(llvm::Triple(Opts.HostTriple), Opts)); + if (!HostTriple.isNVPTX()) + HostTarget.reset(AllocateTarget(llvm::Triple(Opts.HostTriple), Opts)); + + // If no host target, make some guesses about the data layout and return. if (!HostTarget) { + LongWidth = LongAlign = TargetPointerWidth; + PointerWidth = PointerAlign = TargetPointerWidth; + switch (TargetPointerWidth) { + case 32: + SizeType = TargetInfo::UnsignedInt; + PtrDiffType = TargetInfo::SignedInt; + IntPtrType = TargetInfo::SignedInt; + break; + case 64: + SizeType = TargetInfo::UnsignedLong; + PtrDiffType = TargetInfo::SignedLong; + IntPtrType = TargetInfo::SignedLong; + break; + default: + llvm_unreachable("TargetPointerWidth must be 32 or 64"); + } return; } + // Copy properties from host target. 
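// [Illustrative note -- not part of the patch; the command line is only an
// example.] Keeping the host TargetInfo alive in the new HostTarget member
// lets later hooks (e.g. checkCallingConvention() further down) defer to the
// host. In a CUDA device compile such as
//
//   clang -cc1 -triple nvptx64-nvidia-cuda -aux-triple x86_64-linux-gnu ...
//
// the copying below makes the device's size_t, long double, and friends line
// up with the host's, so both halves of the translation agree on type layout.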
PointerWidth = HostTarget->getPointerWidth(/* AddrSpace = */ 0); PointerAlign = HostTarget->getPointerAlign(/* AddrSpace = */ 0); BoolWidth = HostTarget->getBoolWidth(); @@ -1935,6 +1962,16 @@ public: Opts.support("cl_khr_local_int32_base_atomics"); Opts.support("cl_khr_local_int32_extended_atomics"); } + + CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { + // CUDA compilations support all of the host's calling conventions. + // + // TODO: We should warn if you apply a non-default CC to anything other than + // a host function. + if (HostTarget) + return HostTarget->checkCallingConvention(CC); + return CCCR_Warning; + } }; const Builtin::Info NVPTXTargetInfo::BuiltinInfo[] = { @@ -1953,31 +1990,6 @@ ArrayRef NVPTXTargetInfo::getGCCRegNames() const { return llvm::makeArrayRef(GCCRegNames); } -class NVPTX32TargetInfo : public NVPTXTargetInfo { -public: - NVPTX32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) - : NVPTXTargetInfo(Triple, Opts) { - LongWidth = LongAlign = 32; - PointerWidth = PointerAlign = 32; - SizeType = TargetInfo::UnsignedInt; - PtrDiffType = TargetInfo::SignedInt; - IntPtrType = TargetInfo::SignedInt; - resetDataLayout("e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"); - } -}; - -class NVPTX64TargetInfo : public NVPTXTargetInfo { -public: - NVPTX64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) - : NVPTXTargetInfo(Triple, Opts) { - PointerWidth = PointerAlign = 64; - SizeType = TargetInfo::UnsignedLong; - PtrDiffType = TargetInfo::SignedLong; - IntPtrType = TargetInfo::SignedLong; - resetDataLayout("e-i64:64-v16:16-v32:32-n16:32:64"); - } -}; - static const unsigned AMDGPUAddrSpaceMap[] = { 1, // opencl_global 3, // opencl_local @@ -8385,6 +8397,107 @@ public: } }; + +// AVR Target +class AVRTargetInfo : public TargetInfo { +public: + AVRTargetInfo(const llvm::Triple &Triple, const TargetOptions &) + : TargetInfo(Triple) { + TLSSupported = false; + PointerWidth = 16; + PointerAlign = 8; + IntWidth = 16; + IntAlign = 8; + LongWidth = 32; + LongAlign = 8; + LongLongWidth = 64; + LongLongAlign = 8; + SuitableAlign = 8; + DefaultAlignForAttributeAligned = 8; + HalfWidth = 16; + HalfAlign = 8; + FloatWidth = 32; + FloatAlign = 8; + DoubleWidth = 32; + DoubleAlign = 8; + DoubleFormat = &llvm::APFloat::IEEEsingle(); + LongDoubleWidth = 32; + LongDoubleAlign = 8; + LongDoubleFormat = &llvm::APFloat::IEEEsingle(); + SizeType = UnsignedInt; + PtrDiffType = SignedInt; + IntPtrType = SignedInt; + Char16Type = UnsignedInt; + WCharType = SignedInt; + WIntType = SignedInt; + Char32Type = UnsignedLong; + SigAtomicType = SignedChar; + resetDataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:32:32-i64:64:64" + "-f32:32:32-f64:64:64-n8"); + } + + void getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const override { + Builder.defineMacro("__AVR__"); + } + + ArrayRef getTargetBuiltins() const override { + return None; + } + + BuiltinVaListKind getBuiltinVaListKind() const override { + return TargetInfo::VoidPtrBuiltinVaList; + } + + const char *getClobbers() const override { + return ""; + } + + ArrayRef getGCCRegNames() const override { + static const char * const GCCRegNames[] = { + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", + "r24", "r25", "X", "Y", "Z", "SP" + }; + return llvm::makeArrayRef(GCCRegNames); + } + + ArrayRef getGCCRegAliases() const override { + return None; + } + + ArrayRef 
getGCCAddlRegNames() const override { + static const TargetInfo::AddlRegName AddlRegNames[] = { + { { "r26", "r27"}, 26 }, + { { "r28", "r29"}, 27 }, + { { "r30", "r31"}, 28 }, + { { "SPL", "SPH"}, 29 }, + }; + return llvm::makeArrayRef(AddlRegNames); + } + + bool validateAsmConstraint(const char *&Name, + TargetInfo::ConstraintInfo &Info) const override { + return false; + } + + IntType getIntTypeByWidth(unsigned BitWidth, + bool IsSigned) const final { + // AVR prefers int for 16-bit integers. + return BitWidth == 16 ? (IsSigned ? SignedInt : UnsignedInt) + : TargetInfo::getIntTypeByWidth(BitWidth, IsSigned); + } + + IntType getLeastIntTypeByWidth(unsigned BitWidth, + bool IsSigned) const final { + // AVR uses int for int_least16_t and int_fast16_t. + return BitWidth == 16 + ? (IsSigned ? SignedInt : UnsignedInt) + : TargetInfo::getLeastIntTypeByWidth(BitWidth, IsSigned); + } +}; + } // end anonymous namespace //===----------------------------------------------------------------------===// @@ -8507,6 +8620,8 @@ static TargetInfo *AllocateTarget(const llvm::Triple &Triple, return new ARMbeTargetInfo(Triple, Opts); } + case llvm::Triple::avr: + return new AVRTargetInfo(Triple, Opts); case llvm::Triple::bpfeb: case llvm::Triple::bpfel: return new BPFTargetInfo(Triple, Opts); @@ -8632,9 +8747,9 @@ static TargetInfo *AllocateTarget(const llvm::Triple &Triple, } case llvm::Triple::nvptx: - return new NVPTX32TargetInfo(Triple, Opts); + return new NVPTXTargetInfo(Triple, Opts, /*TargetPointerWidth=*/32); case llvm::Triple::nvptx64: - return new NVPTX64TargetInfo(Triple, Opts); + return new NVPTXTargetInfo(Triple, Opts, /*TargetPointerWidth=*/64); case llvm::Triple::amdgcn: case llvm::Triple::r600: diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp index 164e52d7de27..ed09f3a45566 100644 --- a/lib/CodeGen/BackendUtil.cpp +++ b/lib/CodeGen/BackendUtil.cpp @@ -14,6 +14,7 @@ #include "clang/Frontend/CodeGenOptions.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/Utils.h" +#include "clang/Lex/HeaderSearchOptions.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -32,6 +33,7 @@ #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/Verifier.h" #include "llvm/LTO/LTOBackend.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/ModuleSummaryIndexObjectFile.h" #include "llvm/Passes/PassBuilder.h" @@ -61,6 +63,7 @@ namespace { class EmitAssemblyHelper { DiagnosticsEngine &Diags; + const HeaderSearchOptions &HSOpts; const CodeGenOptions &CodeGenOpts; const clang::TargetOptions &TargetOpts; const LangOptions &LangOpts; @@ -100,11 +103,14 @@ private: raw_pwrite_stream &OS); public: - EmitAssemblyHelper(DiagnosticsEngine &_Diags, const CodeGenOptions &CGOpts, + EmitAssemblyHelper(DiagnosticsEngine &_Diags, + const HeaderSearchOptions &HeaderSearchOpts, + const CodeGenOptions &CGOpts, const clang::TargetOptions &TOpts, const LangOptions &LOpts, Module *M) - : Diags(_Diags), CodeGenOpts(CGOpts), TargetOpts(TOpts), LangOpts(LOpts), - TheModule(M), CodeGenerationTime("codegen", "Code Generation Time") {} + : Diags(_Diags), HSOpts(HeaderSearchOpts), CodeGenOpts(CGOpts), + TargetOpts(TOpts), LangOpts(LOpts), TheModule(M), + CodeGenerationTime("codegen", "Code Generation Time") {} ~EmitAssemblyHelper() { if (CodeGenOpts.DisableFree) @@ -584,12 +590,18 @@ void EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) { Options.MCOptions.MCNoExecStack = 
CodeGenOpts.NoExecStack; Options.MCOptions.MCIncrementalLinkerCompatible = CodeGenOpts.IncrementalLinkerCompatible; - Options.MCOptions.MCPIECopyRelocations = - CodeGenOpts.PIECopyRelocations; + Options.MCOptions.MCPIECopyRelocations = CodeGenOpts.PIECopyRelocations; Options.MCOptions.MCFatalWarnings = CodeGenOpts.FatalWarnings; Options.MCOptions.AsmVerbose = CodeGenOpts.AsmVerbose; Options.MCOptions.PreserveAsmComments = CodeGenOpts.PreserveAsmComments; Options.MCOptions.ABIName = TargetOpts.ABI; + for (const auto &Entry : HSOpts.UserEntries) + if (!Entry.IsFramework && + (Entry.Group == frontend::IncludeDirGroup::Quoted || + Entry.Group == frontend::IncludeDirGroup::Angled || + Entry.Group == frontend::IncludeDirGroup::System)) + Options.MCOptions.IASSearchPaths.push_back( + Entry.IgnoreSysRoot ? Entry.Path : HSOpts.Sysroot + Entry.Path); TM.reset(TheTarget->createTargetMachine(Triple, TargetOpts.CPU, FeaturesStr, Options, RM, CM, OptLevel)); @@ -929,17 +941,19 @@ static void runThinLTOBackend(const CodeGenOptions &CGOpts, Module *M, } void clang::EmitBackendOutput(DiagnosticsEngine &Diags, + const HeaderSearchOptions &HeaderOpts, const CodeGenOptions &CGOpts, const clang::TargetOptions &TOpts, - const LangOptions &LOpts, const llvm::DataLayout &TDesc, - Module *M, BackendAction Action, + const LangOptions &LOpts, + const llvm::DataLayout &TDesc, Module *M, + BackendAction Action, std::unique_ptr OS) { if (!CGOpts.ThinLTOIndexFile.empty()) { runThinLTOBackend(CGOpts, M, std::move(OS)); return; } - EmitAssemblyHelper AsmHelper(Diags, CGOpts, TOpts, LOpts, M); + EmitAssemblyHelper AsmHelper(Diags, HeaderOpts, CGOpts, TOpts, LOpts, M); if (CGOpts.ExperimentalNewPassManager) AsmHelper.EmitAssemblyWithNewPassManager(Action, std::move(OS)); diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 43ca74761fbd..4d34b3e9222f 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -35,6 +35,11 @@ using namespace clang; using namespace CodeGen; using namespace llvm; +static +int64_t clamp(int64_t Value, int64_t Low, int64_t High) { + return std::min(High, std::max(Low, Value)); +} + /// getBuiltinLibFunction - Given a builtin id for a function like /// "__builtin_fabsf", return a Function* for "fabsf". llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD, @@ -8191,6 +8196,85 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, llvm_unreachable("Unknown FMA operation"); return nullptr; // Suppress no-return warning } + + case PPC::BI__builtin_vsx_insertword: { + llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw); + + // Third argument is a compile time constant int. It must be clamped to + // to the range [0, 12]. + ConstantInt *ArgCI = dyn_cast(Ops[2]); + assert(ArgCI && + "Third arg to xxinsertw intrinsic must be constant integer"); + const int64_t MaxIndex = 12; + int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex); + + // The builtin semantics don't exactly match the xxinsertw instructions + // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the + // word from the first argument, and inserts it in the second argument. The + // instruction extracts the word from its second input register and inserts + // it into its first input register, so swap the first and second arguments. + std::swap(Ops[0], Ops[1]); + + // Need to cast the second argument from a vector of unsigned int to a + // vector of long long. 
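// [Worked example -- editor's sketch, not from the patch.] The index handling
// a few lines below mirrors the byte offset on little-endian targets:
// Index = MaxIndex - Index, paired with a (1, 0) shufflevector that swaps the
// two doublewords of the operand. For instance, a request to insert at byte
// offset 4 becomes offset 12 - 4 = 8 after reversal, which names the same
// logical word once the doublewords have been swapped.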
+ Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2)); + + if (getTarget().isLittleEndian()) { + // Create a shuffle mask of (1, 0) + Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 0) + }; + Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts); + + // Reverse the double words in the vector we will extract from. + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2)); + Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ShuffleMask); + + // Reverse the index. + Index = MaxIndex - Index; + } + + // Intrinsic expects the first arg to be a vector of int. + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4)); + Ops[2] = ConstantInt::getSigned(Int32Ty, Index); + return Builder.CreateCall(F, Ops); + } + + case PPC::BI__builtin_vsx_extractuword: { + llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw); + + // Intrinsic expects the first argument to be a vector of doublewords. + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2)); + + // The second argument is a compile time constant int that needs to + // be clamped to the range [0, 12]. + ConstantInt *ArgCI = dyn_cast(Ops[1]); + assert(ArgCI && + "Second Arg to xxextractuw intrinsic must be a constant integer!"); + const int64_t MaxIndex = 12; + int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex); + + if (getTarget().isLittleEndian()) { + // Reverse the index. + Index = MaxIndex - Index; + Ops[1] = ConstantInt::getSigned(Int32Ty, Index); + + // Emit the call, then reverse the double words of the results vector. + Value *Call = Builder.CreateCall(F, Ops); + + // Create a shuffle mask of (1, 0) + Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 0) + }; + Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts); + + Value *ShuffleCall = Builder.CreateShuffleVector(Call, Call, ShuffleMask); + return ShuffleCall; + } else { + Ops[1] = ConstantInt::getSigned(Int32Ty, Index); + return Builder.CreateCall(F, Ops); + } + } } } diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp index 9b96a59aec38..c7c61e0c8ecb 100644 --- a/lib/CodeGen/CGCall.cpp +++ b/lib/CodeGen/CGCall.cpp @@ -393,15 +393,13 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { // When declaring a function without a prototype, always use a // non-variadic type. - if (isa(FTy)) { - CanQual noProto = FTy.getAs(); + if (CanQual noProto = FTy.getAs()) { return arrangeLLVMFunctionInfo( noProto->getReturnType(), /*instanceMethod=*/false, /*chainCall=*/false, None, noProto->getExtInfo(), {},RequiredArgs::All); } - assert(isa(FTy)); - return arrangeFreeFunctionType(FTy.getAs(), FD); + return arrangeFreeFunctionType(FTy.castAs(), FD); } /// Arrange the argument and result information for the declaration or diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp index 183201c78e36..e5e34a5f3ed6 100644 --- a/lib/CodeGen/CGExpr.cpp +++ b/lib/CodeGen/CGExpr.cpp @@ -604,12 +604,13 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, } if (Checks.size() > 0) { + // Make sure we're not losing information. 
Alignment needs to be a power of + // 2 + assert(!AlignVal || (uint64_t)1 << llvm::Log2_64(AlignVal) == AlignVal); llvm::Constant *StaticData[] = { - EmitCheckSourceLocation(Loc), - EmitCheckTypeDescriptor(Ty), - llvm::ConstantInt::get(SizeTy, AlignVal), - llvm::ConstantInt::get(Int8Ty, TCK) - }; + EmitCheckSourceLocation(Loc), EmitCheckTypeDescriptor(Ty), + llvm::ConstantInt::get(Int8Ty, AlignVal ? llvm::Log2_64(AlignVal) : 1), + llvm::ConstantInt::get(Int8Ty, TCK)}; EmitCheck(Checks, SanitizerHandler::TypeMismatch, StaticData, Ptr); } diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp index 0624d86b564a..27af344fae87 100644 --- a/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2701,14 +2701,16 @@ void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: "only required for the device " "code generation."); OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = - OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr); + OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr, + /*Flags=*/0); ++OffloadingEntriesNum; } void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, - llvm::Constant *Addr, llvm::Constant *ID) { + llvm::Constant *Addr, llvm::Constant *ID, + int32_t Flags) { // If we are emitting code for a target, the entry is already initialized, // only has to be registered. if (CGM.getLangOpts().OpenMPIsDevice) { @@ -2719,9 +2721,10 @@ void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: assert(Entry.isValid() && "Entry not initialized!"); Entry.setAddress(Addr); Entry.setID(ID); + Entry.setFlags(Flags); return; } else { - OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID); + OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID, Flags); OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = Entry; } } @@ -2888,7 +2891,8 @@ CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() { } void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID, - llvm::Constant *Addr, uint64_t Size) { + llvm::Constant *Addr, uint64_t Size, + int32_t Flags) { StringRef Name = Addr->getName(); auto *TgtOffloadEntryType = cast( CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy())); @@ -2918,6 +2922,8 @@ void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID, EntryInit.add(AddrPtr); EntryInit.add(StrPtr); EntryInit.addInt(CGM.SizeTy, Size); + EntryInit.addInt(CGM.Int32Ty, Flags); + EntryInit.addInt(CGM.Int32Ty, 0); llvm::GlobalVariable *Entry = EntryInit.finishAndCreateGlobal(".omp_offloading.entry", Align, @@ -3090,6 +3096,8 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { // // (function or global) // char *name; // Name of the function or global. // size_t size; // Size of the entry info (0 if it a function). + // int32_t flags; // Flags associated with the entry, e.g. 'link'. + // int32_t reserved; // Reserved, to use by the runtime library. 
// }; if (TgtOffloadEntryQTy.isNull()) { ASTContext &C = CGM.getContext(); @@ -3098,6 +3106,10 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { addFieldToRecordDecl(C, RD, C.VoidPtrTy); addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy)); addFieldToRecordDecl(C, RD, C.getSizeType()); + addFieldToRecordDecl( + C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); + addFieldToRecordDecl( + C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); RD->completeDefinition(); TgtOffloadEntryQTy = C.getRecordType(RD); } @@ -4852,7 +4864,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( // Register the information for the entry associated with this target region. OffloadEntriesInfoManager.registerTargetRegionEntryInfo( - DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID); + DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID, + /*Flags=*/0); } /// discard all CompoundStmts intervening between two constructs diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h index 9057e5ec4c14..9a784dff0ae8 100644 --- a/lib/CodeGen/CGOpenMPRuntime.h +++ b/lib/CodeGen/CGOpenMPRuntime.h @@ -110,9 +110,9 @@ protected: CodeGenModule &CGM; /// \brief Creates offloading entry for the provided entry ID \a ID, - /// address \a Addr and size \a Size. + /// address \a Addr, size \a Size, and flags \a Flags. virtual void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, - uint64_t Size); + uint64_t Size, int32_t Flags = 0); /// \brief Helper to emit outlined function for 'target' directive. /// \param D Directive to emit. @@ -245,10 +245,10 @@ private: unsigned OffloadingEntriesNum; public: - /// \brief Base class of the entries info. + /// Base class of the entries info. class OffloadEntryInfo { public: - /// \brief Kind of a given entry. Currently, only target regions are + /// Kind of a given entry. Currently, only target regions are /// supported. enum OffloadingEntryInfoKinds : unsigned { // Entry is a target region. @@ -257,17 +257,24 @@ private: OFFLOAD_ENTRY_INFO_INVALID = ~0u }; - OffloadEntryInfo() : Order(~0u), Kind(OFFLOAD_ENTRY_INFO_INVALID) {} - explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order) - : Order(Order), Kind(Kind) {} + OffloadEntryInfo() + : Flags(0), Order(~0u), Kind(OFFLOAD_ENTRY_INFO_INVALID) {} + explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, + int32_t Flags) + : Flags(Flags), Order(Order), Kind(Kind) {} bool isValid() const { return Order != ~0u; } unsigned getOrder() const { return Order; } OffloadingEntryInfoKinds getKind() const { return Kind; } + int32_t getFlags() const { return Flags; } + void setFlags(int32_t NewFlags) { Flags = NewFlags; } static bool classof(const OffloadEntryInfo *Info) { return true; } - protected: - // \brief Order this entry was emitted. + private: + /// Flags associated with the device global. + int32_t Flags; + + /// Order this entry was emitted. 
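// [Editor's sketch -- not part of the patch; the IR below is approximate.]
// With the two fields added to __tgt_offload_entry above, each generated
// .omp_offloading.entry global now has the shape
//
//   { i8* addr, i8* name, i64 size, i32 flags, i32 0 /* reserved */ }
//
// Target regions are currently registered with Flags == 0; the field is held
// in the new Flags member below and is meant for attributes such as 'link'.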
unsigned Order; OffloadingEntryInfoKinds Kind; @@ -292,12 +299,13 @@ private: public: OffloadEntryInfoTargetRegion() - : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, ~0u), + : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, ~0u, + /*Flags=*/0), Addr(nullptr), ID(nullptr) {} explicit OffloadEntryInfoTargetRegion(unsigned Order, llvm::Constant *Addr, - llvm::Constant *ID) - : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, Order), + llvm::Constant *ID, int32_t Flags) + : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, Order, Flags), Addr(Addr), ID(ID) {} llvm::Constant *getAddress() const { return Addr; } @@ -321,8 +329,8 @@ private: /// \brief Register target region entry. void registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, - llvm::Constant *Addr, - llvm::Constant *ID); + llvm::Constant *Addr, llvm::Constant *ID, + int32_t Flags); /// \brief Return true if a target region entry with the provided /// information exists. bool hasTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index fe0e2acdfdbf..bc1458b1c203 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -22,14 +22,10 @@ using namespace CodeGen; namespace { enum OpenMPRTLFunctionNVPTX { - /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle, - /// kmp_int32 thread_limit); + /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit); OMPRTL_NVPTX__kmpc_kernel_init, -}; - -// NVPTX Address space -enum AddressSpace { - AddressSpaceShared = 3, + /// \brief Call to void __kmpc_kernel_deinit(); + OMPRTL_NVPTX__kmpc_kernel_deinit, }; } // namespace @@ -70,6 +66,15 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) { /// Synchronize all GPU threads in a block. static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } +/// Get the value of the thread_limit clause in the teams directive. +/// The runtime encodes thread_limit in the launch parameter, always starting +/// thread_limit+warpSize threads per team. +static llvm::Value *getThreadLimit(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), + "thread_limit"); +} + /// Get the thread id of the OMP master thread. /// The master thread id is the first thread (lane) of the last warp in the /// GPU block. Warp size is assumed to be some power of 2. @@ -103,35 +108,105 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, /* placeholder */ "_worker", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); - WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage); - WorkerFn->addFnAttr(llvm::Attribute::NoInline); } -void CGOpenMPRuntimeNVPTX::initializeEnvironment() { - // - // Initialize master-worker control state in shared memory. - // +void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) { + EntryFunctionState EST; + WorkerFunctionState WST(CGM); + + // Emit target region as a standalone region. 
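// [Editor's sketch -- not part of the patch.] The generic kernel emitted by
// this function lays threads out roughly as follows (tid = CTA thread id,
// thread_limit = nthreads - warpsize, per getThreadLimit() above):
//
//   if (tid < thread_limit) {        // worker threads
//     worker();                      // park in the work-dispatch loop
//   } else if (tid == master_tid) {  // first lane of the last warp
//     __kmpc_kernel_init(thread_limit);
//     ... sequential part of the target region ...
//     __kmpc_kernel_deinit();        // then barrier, then exit
//   }                                // other lanes fall through to .exit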
+ class NVPTXPrePostActionTy : public PrePostActionTy { + CGOpenMPRuntimeNVPTX &RT; + CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; + CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; + + public: + NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, + CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, + CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) + : RT(RT), EST(EST), WST(WST) {} + void Enter(CodeGenFunction &CGF) override { + RT.emitGenericEntryHeader(CGF, EST, WST); + } + void Exit(CodeGenFunction &CGF) override { + RT.emitGenericEntryFooter(CGF, EST); + } + } Action(*this, EST, WST); + CodeGen.setAction(Action); + emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, + IsOffloadEntry, CodeGen); - auto DL = CGM.getDataLayout(); - ActiveWorkers = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, - llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0, - llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared); - ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty)); - - WorkID = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, - llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0, - llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared); - WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty)); + // Create the worker function + emitWorkerFunction(WST); + + // Now change the name of the worker function to correspond to this target + // region's entry function. + WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); +} + +// Setup NVPTX threads for master-worker OpenMP scheme. +void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, + EntryFunctionState &EST, + WorkerFunctionState &WST) { + CGBuilderTy &Bld = CGF.Builder; + + llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); + llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); + llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); + EST.ExitBB = CGF.createBasicBlock(".exit"); + + auto *IsWorker = + Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); + Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); + + CGF.EmitBlock(WorkerBB); + CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); + CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(MasterCheckBB); + auto *IsMaster = + Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); + Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); + + CGF.EmitBlock(MasterBB); + // First action in sequential region: + // Initialize the state of the OpenMP runtime library on the GPU. + llvm::Value *Args[] = {getThreadLimit(CGF)}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); +} + +void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, + EntryFunctionState &EST) { + if (!EST.ExitBB) + EST.ExitBB = CGF.createBasicBlock(".exit"); + + llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); + CGF.EmitBranch(TerminateBB); + + CGF.EmitBlock(TerminateBB); + // Signal termination condition. + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); + // Barrier to terminate worker threads. + syncCTAThreads(CGF); + // Master thread jumps to exit point. 
+ CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(EST.ExitBB); + EST.ExitBB = nullptr; } void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { auto &Ctx = CGM.getContext(); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); emitWorkerLoop(CGF, WST); CGF.FinishFunction(); @@ -163,21 +238,26 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(AwaitBB); // Wait for parallel work syncCTAThreads(CGF); + + Address WorkFn = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); + Address ExecStatus = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); + CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); + CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); + + // TODO: Call into runtime to get parallel work. + // On termination condition (workid == 0), exit loop. - llvm::Value *ShouldTerminate = Bld.CreateICmpEQ( - Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()), - llvm::Constant::getNullValue(WorkID->getType()->getElementType()), - "should_terminate"); + llvm::Value *ShouldTerminate = + Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. CGF.EmitBlock(SelectWorkersBB); - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - llvm::Value *ActiveThread = Bld.CreateICmpSLT( - ThreadID, - Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()), - "active_thread"); - Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB); + llvm::Value *IsActive = + Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); + Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); // Signal start of parallel region. CGF.EmitBlock(ExecuteBB); @@ -197,72 +277,6 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(ExitBB); } -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { - CGBuilderTy &Bld = CGF.Builder; - - // Get the master thread id. - llvm::Value *MasterID = getMasterThreadID(CGF); - // Current thread's identifier. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - - // Setup BBs in entry function. - llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker"); - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - // The head (master thread) marches on while its body of companion threads in - // the warp go to sleep. - llvm::Value *ShouldDie = - Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp"); - Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB); - - // Select worker threads... - CGF.EmitBlock(WorkerCheckBB); - llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker"); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB); - - // ... and send to worker loop, awaiting parallel invocation. - CGF.EmitBlock(WorkerBB); - CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); - CGF.EmitBranch(EST.ExitBB); - - // Only master thread executes subsequent serial code. - CGF.EmitBlock(MasterBB); - - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. 
- llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), - Args); -} - -void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - Bld.CreateAlignedStore( - llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID, - WorkID->getAlignment()); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -272,21 +286,27 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast(Function)) { case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 omp_handle, - // kmp_int32 thread_limit); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty}; + // Build void __kmpc_kernel_init(kmp_int32 thread_limit); + llvm::Type *TypeParams[] = {CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); break; } + case OMPRTL_NVPTX__kmpc_kernel_deinit: { + // Build void __kmpc_kernel_deinit(); + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); + break; + } } return RTLFn; } void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, - uint64_t Size) { + uint64_t Size, int32_t) { auto *F = dyn_cast(Addr); // TODO: Add support for global variables on the device after declare target // support. @@ -315,44 +335,14 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); - EntryFunctionState EST; - WorkerFunctionState WST(CGM); - - // Emit target region as a standalone region. - class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX &RT; - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) - : RT(RT), EST(EST), WST(WST) {} - void Enter(CodeGenFunction &CGF) override { - RT.emitEntryHeader(CGF, EST, WST); - } - void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); } - } Action(*this, EST, WST); - CodeGen.setAction(Action); - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - - // Create the worker function - emitWorkerFunction(WST); - - // Now change the name of the worker function to correspond to this target - // region's entry function. 
- WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); + emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + CodeGen); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) { + : CGOpenMPRuntime(CGM) { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); - - // Called once per module during initialization. - initializeEnvironment(); } void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index a33fb27579f6..63a02965a5bd 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -24,7 +24,7 @@ namespace clang { namespace CodeGen { class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime { -public: +private: struct EntryFunctionState { llvm::BasicBlock *ExitBB = nullptr; }; @@ -40,34 +40,21 @@ public: void createWorkerFunction(CodeGenModule &CGM); }; - /// \brief Helper for target entry function. Guide the master and worker - /// threads to their respective locations. - void emitEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, - WorkerFunctionState &WST); - - /// \brief Signal termination of OMP execution. - void emitEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); - -private: - // - // Private state and methods. - // - - // Master-worker control state. - // Number of requested OMP threads in parallel region. - llvm::GlobalVariable *ActiveWorkers; - // Outlined function for the workers to execute. - llvm::GlobalVariable *WorkID; - - /// \brief Initialize master-worker control state. - void initializeEnvironment(); - /// \brief Emit the worker function for the current target region. void emitWorkerFunction(WorkerFunctionState &WST); /// \brief Helper for worker function. Emit body of worker loop. void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST); + /// \brief Helper for generic target entry function. Guide the master and + /// worker threads to their respective locations. + void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, + WorkerFunctionState &WST); + + /// \brief Signal termination of OMP execution for generic target entry + /// function. + void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -79,9 +66,23 @@ private: // /// \brief Creates offloading entry for the provided entry ID \a ID, - /// address \a Addr and size \a Size. + /// address \a Addr, size \a Size, and flags \a Flags. void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, - uint64_t Size) override; + uint64_t Size, int32_t Flags = 0) override; + + /// \brief Emit outlined function specialized for the Fork-Join + /// programming model for applicable target directives on the NVPTX device. + /// \param D Directive to emit. + /// \param ParentName Name of the function that encloses the target region. + /// \param OutlinedFn Outlined function value to be defined by this call. + /// \param OutlinedFnID Outlined function ID value to be defined by this call. + /// \param IsOffloadEntry True if the outlined function is an offload entry. + /// An outlined function may not be an entry if, e.g. the if clause always + /// evaluates to false. 
+ void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen); /// \brief Emit outlined function for 'target' directive on the NVPTX /// device. diff --git a/lib/CodeGen/CodeGenAction.cpp b/lib/CodeGen/CodeGenAction.cpp index 1e17918df4a4..5f74141d75b3 100644 --- a/lib/CodeGen/CodeGenAction.cpp +++ b/lib/CodeGen/CodeGenAction.cpp @@ -44,6 +44,7 @@ namespace clang { virtual void anchor(); DiagnosticsEngine &Diags; BackendAction Action; + const HeaderSearchOptions &HeaderSearchOpts; const CodeGenOptions &CodeGenOpts; const TargetOptions &TargetOpts; const LangOptions &LangOpts; @@ -77,8 +78,8 @@ namespace clang { const SmallVectorImpl> &LinkModules, std::unique_ptr OS, LLVMContext &C, CoverageSourceInfo *CoverageInfo = nullptr) - : Diags(Diags), Action(Action), CodeGenOpts(CodeGenOpts), - TargetOpts(TargetOpts), LangOpts(LangOpts), + : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts), + CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts), AsmOutStream(std::move(OS)), Context(nullptr), LLVMIRGeneration("irgen", "LLVM IR Generation Time"), LLVMIRGenerationRefCount(0), @@ -225,8 +226,8 @@ namespace clang { EmbedBitcode(getModule(), CodeGenOpts, llvm::MemoryBufferRef()); - EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts, - C.getTargetInfo().getDataLayout(), + EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, + LangOpts, C.getTargetInfo().getDataLayout(), getModule(), Action, std::move(AsmOutStream)); Ctx.setInlineAsmDiagnosticHandler(OldHandler, OldContext); @@ -898,9 +899,10 @@ void CodeGenAction::ExecuteAction() { Ctx.setInlineAsmDiagnosticHandler(BitcodeInlineAsmDiagHandler, &CI.getDiagnostics()); - EmitBackendOutput(CI.getDiagnostics(), CI.getCodeGenOpts(), TargetOpts, - CI.getLangOpts(), CI.getTarget().getDataLayout(), - TheModule.get(), BA, std::move(OS)); + EmitBackendOutput(CI.getDiagnostics(), CI.getHeaderSearchOpts(), + CI.getCodeGenOpts(), TargetOpts, CI.getLangOpts(), + CI.getTarget().getDataLayout(), TheModule.get(), BA, + std::move(OS)); return; } diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h index 1347f54df9ac..05522cd40024 100644 --- a/lib/CodeGen/CodeGenFunction.h +++ b/lib/CodeGen/CodeGenFunction.h @@ -120,7 +120,7 @@ enum TypeEvaluationKind { SANITIZER_CHECK(OutOfBounds, out_of_bounds, 0) \ SANITIZER_CHECK(ShiftOutOfBounds, shift_out_of_bounds, 0) \ SANITIZER_CHECK(SubOverflow, sub_overflow, 0) \ - SANITIZER_CHECK(TypeMismatch, type_mismatch, 0) \ + SANITIZER_CHECK(TypeMismatch, type_mismatch, 1) \ SANITIZER_CHECK(VLABoundNotPositive, vla_bound_not_positive, 0) enum SanitizerHandler { diff --git a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp index baf7811eedaf..754f9968b67f 100644 --- a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp +++ b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp @@ -282,7 +282,7 @@ public: // Print the IR for the PCH container to the debug output. llvm::SmallString<0> Buffer; clang::EmitBackendOutput( - Diags, CodeGenOpts, TargetOpts, LangOpts, + Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, LangOpts, Ctx.getTargetInfo().getDataLayout(), M.get(), BackendAction::Backend_EmitLL, llvm::make_unique(Buffer)); @@ -290,9 +290,10 @@ public: }); // Use the LLVM backend to emit the pch container. 
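// [Editor's note -- a sketch, not part of the patch.] EmitBackendOutput()
// gained a HeaderSearchOptions parameter so CreateTargetMachine() can seed
// MCOptions.IASSearchPaths from the quoted/angled/system user entries (see
// the BackendUtil.cpp hunk earlier); the integrated assembler can then
// resolve .include-style references against the -I search path. Every call
// site, including the two in this file, picks up the extra argument:
//
//   EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts,
//                     LangOpts, DataLayout, M, Action, std::move(OS));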
- clang::EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts, - Ctx.getTargetInfo().getDataLayout(), M.get(), - BackendAction::Backend_EmitObj, std::move(OS)); + clang::EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, + LangOpts, Ctx.getTargetInfo().getDataLayout(), + M.get(), BackendAction::Backend_EmitObj, + std::move(OS)); // Free the memory for the temporary buffer. llvm::SmallVector Empty; diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp index 391eb53d2500..d2fc3888ef29 100644 --- a/lib/CodeGen/TargetInfo.cpp +++ b/lib/CodeGen/TargetInfo.cpp @@ -871,6 +871,14 @@ static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) { return NumMembers <= 4; } +/// Returns a Homogeneous Vector Aggregate ABIArgInfo, used in X86. +static ABIArgInfo getDirectX86Hva(llvm::Type* T = nullptr) { + auto AI = ABIArgInfo::getDirect(T); + AI.setInReg(true); + AI.setCanBeFlattened(false); + return AI; +} + //===----------------------------------------------------------------------===// // X86-32 ABI Implementation //===----------------------------------------------------------------------===// @@ -884,6 +892,11 @@ struct CCState { unsigned FreeSSERegs; }; +enum { + // Vectorcall only allows the first 6 parameters to be passed in registers. + VectorcallMaxParamNumAsReg = 6 +}; + /// X86_32ABIInfo - The X86-32 ABI information. class X86_32ABIInfo : public SwiftABIInfo { enum Class { @@ -929,6 +942,8 @@ class X86_32ABIInfo : public SwiftABIInfo { Class classify(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy, CCState &State) const; ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const; + ABIArgInfo reclassifyHvaArgType(QualType RetTy, CCState &State, + const ABIArgInfo& current) const; /// \brief Updates the number of available free registers, returns /// true if any registers were allocated. bool updateFreeRegs(QualType Ty, CCState &State) const; @@ -946,6 +961,8 @@ class X86_32ABIInfo : public SwiftABIInfo { void addFieldToArgStruct(SmallVector &FrameFields, CharUnits &StackOffset, ABIArgInfo &Info, QualType Type) const; + void computeVectorCallArgs(CGFunctionInfo &FI, CCState &State, + bool &UsedInAlloca) const; public: @@ -1494,6 +1511,27 @@ bool X86_32ABIInfo::shouldPrimitiveUseInReg(QualType Ty, CCState &State) const { return true; } +ABIArgInfo +X86_32ABIInfo::reclassifyHvaArgType(QualType Ty, CCState &State, + const ABIArgInfo ¤t) const { + // Assumes vectorCall calling convention. + const Type *Base = nullptr; + uint64_t NumElts = 0; + + if (!Ty->isBuiltinType() && !Ty->isVectorType() && + isHomogeneousAggregate(Ty, Base, NumElts)) { + if (State.FreeSSERegs >= NumElts) { + // HVA types get passed directly in registers if there is room. + State.FreeSSERegs -= NumElts; + return getDirectX86Hva(); + } + // If there's no room, the HVA gets passed as normal indirect + // structure. + return getIndirectResult(Ty, /*ByVal=*/false, State); + } + return current; +} + ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State) const { // FIXME: Set alignment on indirect arguments. @@ -1513,19 +1551,34 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, } // vectorcall adds the concept of a homogenous vector aggregate, similar - // to other targets. + // to other targets, regcall uses some of the HVA rules. 
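// [Illustrative example -- editor's addition; HVA4 is a made-up name.] A
// homogeneous vector aggregate is a struct whose members all have the same
// floating-point or vector type, e.g.
//
//   struct HVA4 { __m128 a, b, c, d; };  // four <4 x float> members
//
// Under vectorcall (below) such a type is passed in XMM registers only on the
// second classification pass, after every non-HVA argument has had its
// chance; under regcall it takes registers immediately when enough are free.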
const Type *Base = nullptr; uint64_t NumElts = 0; if ((State.CC == llvm::CallingConv::X86_VectorCall || State.CC == llvm::CallingConv::X86_RegCall) && isHomogeneousAggregate(Ty, Base, NumElts)) { - if (State.FreeSSERegs >= NumElts) { - State.FreeSSERegs -= NumElts; - if (Ty->isBuiltinType() || Ty->isVectorType()) + + if (State.CC == llvm::CallingConv::X86_RegCall) { + if (State.FreeSSERegs >= NumElts) { + State.FreeSSERegs -= NumElts; + if (Ty->isBuiltinType() || Ty->isVectorType()) + return ABIArgInfo::getDirect(); + return ABIArgInfo::getExpand(); + + } + return getIndirectResult(Ty, /*ByVal=*/false, State); + } else if (State.CC == llvm::CallingConv::X86_VectorCall) { + if (State.FreeSSERegs >= NumElts && (Ty->isBuiltinType() || Ty->isVectorType())) { + // Actual floating-point types get registers first time through if + // there is registers available + State.FreeSSERegs -= NumElts; return ABIArgInfo::getDirect(); - return ABIArgInfo::getExpand(); + } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) { + // HVA Types only get registers after everything else has been + // set, so it gets set as indirect for now. + return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty)); + } } - return getIndirectResult(Ty, /*ByVal=*/false, State); } if (isAggregateTypeForABI(Ty)) { @@ -1604,6 +1657,36 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, return ABIArgInfo::getDirect(); } +void X86_32ABIInfo::computeVectorCallArgs(CGFunctionInfo &FI, CCState &State, + bool &UsedInAlloca) const { + // Vectorcall only allows the first 6 parameters to be passed in registers, + // and homogeneous vector aggregates are only put into registers as a second + // priority. + unsigned Count = 0; + CCState ZeroState = State; + ZeroState.FreeRegs = ZeroState.FreeSSERegs = 0; + // HVAs must be done as a second priority for registers, so the deferred + // items are dealt with by going through the pattern a second time. + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = classifyArgumentType(I.type, State); + else + // Parameters after the 6th cannot be passed in registers, + // so pretend there are no registers left for them. + I.info = classifyArgumentType(I.type, ZeroState); + UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + ++Count; + } + Count = 0; + // Go through the arguments a second time to get HVAs registers if there + // are still some available. + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = reclassifyHvaArgType(I.type, State, I.info); + ++Count; + } +} + void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const { CCState State(FI.getCallingConvention()); if (IsMCUABI) @@ -1638,9 +1721,14 @@ void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const { ++State.FreeRegs; bool UsedInAlloca = false; - for (auto &I : FI.arguments()) { - I.info = classifyArgumentType(I.type, State); - UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + if (State.CC == llvm::CallingConv::X86_VectorCall) { + computeVectorCallArgs(FI, State, UsedInAlloca); + } else { + // If not vectorcall, revert to normal behavior. 
+ for (auto &I : FI.arguments()) { + I.info = classifyArgumentType(I.type, State); + UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + } } // If we needed to use inalloca for any argument, do a second pass and rewrite @@ -2070,10 +2158,14 @@ public: } private: - ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs, - bool IsReturnType) const; - - bool IsMingw64; + ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs, bool IsReturnType, + bool IsVectorCall, bool IsRegCall) const; + ABIArgInfo reclassifyHvaArgType(QualType Ty, unsigned &FreeSSERegs, + const ABIArgInfo ¤t) const; + void computeVectorCallArgs(CGFunctionInfo &FI, unsigned FreeSSERegs, + bool IsVectorCall, bool IsRegCall) const; + + bool IsMingw64; }; class X86_64TargetCodeGenInfo : public TargetCodeGenInfo { @@ -3679,8 +3771,24 @@ Address X86_64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr, /*allowHigherAlign*/ false); } +ABIArgInfo +WinX86_64ABIInfo::reclassifyHvaArgType(QualType Ty, unsigned &FreeSSERegs, + const ABIArgInfo ¤t) const { + // Assumes vectorCall calling convention. + const Type *Base = nullptr; + uint64_t NumElts = 0; + + if (!Ty->isBuiltinType() && !Ty->isVectorType() && + isHomogeneousAggregate(Ty, Base, NumElts) && FreeSSERegs >= NumElts) { + FreeSSERegs -= NumElts; + return getDirectX86Hva(); + } + return current; +} + ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, - bool IsReturnType) const { + bool IsReturnType, bool IsVectorCall, + bool IsRegCall) const { if (Ty->isVoidType()) return ABIArgInfo::getIgnore(); @@ -3704,21 +3812,34 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, } - // vectorcall adds the concept of a homogenous vector aggregate, similar to - // other targets. const Type *Base = nullptr; uint64_t NumElts = 0; - if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) { - if (FreeSSERegs >= NumElts) { - FreeSSERegs -= NumElts; - if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType()) + // vectorcall adds the concept of a homogenous vector aggregate, similar to + // other targets. + if ((IsVectorCall || IsRegCall) && + isHomogeneousAggregate(Ty, Base, NumElts)) { + if (IsRegCall) { + if (FreeSSERegs >= NumElts) { + FreeSSERegs -= NumElts; + if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType()) + return ABIArgInfo::getDirect(); + return ABIArgInfo::getExpand(); + } + return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); + } else if (IsVectorCall) { + if (FreeSSERegs >= NumElts && + (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())) { + FreeSSERegs -= NumElts; return ABIArgInfo::getDirect(); - return ABIArgInfo::getExpand(); + } else if (IsReturnType) { + return ABIArgInfo::getExpand(); + } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) { + // HVAs are delayed and reclassified in the 2nd step. + return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); + } } - return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); } - if (Ty->isMemberPointerType()) { // If the member pointer is represented by an LLVM int or ptr, pass it // directly. 
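// [Editor's summary -- a sketch, not part of the patch.] The 32- and 64-bit
// vectorcall paths now share the same two-pass scheme:
//
//   pass 1: classify arguments normally; HVAs that are not plain builtin or
//           vector types are provisionally passed indirectly, and only the
//           first VectorcallMaxParamNumAsReg (6) parameters may use registers;
//   pass 2: revisit the provisional HVAs (reclassifyHvaArgType) and promote
//           them to getDirectX86Hva() if enough SSE registers remain.
//
// This appears to match MSVC's rule that HVAs receive registers only after
// all other register candidates have been placed.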
@@ -3754,6 +3875,32 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, return ABIArgInfo::getDirect(); } +void WinX86_64ABIInfo::computeVectorCallArgs(CGFunctionInfo &FI, + unsigned FreeSSERegs, + bool IsVectorCall, + bool IsRegCall) const { + unsigned Count = 0; + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = classify(I.type, FreeSSERegs, false, IsVectorCall, IsRegCall); + else { + // Since these cannot be passed in registers, pretend no registers + // are left. + unsigned ZeroSSERegsAvail = 0; + I.info = classify(I.type, /*FreeSSERegs=*/ZeroSSERegsAvail, false, + IsVectorCall, IsRegCall); + } + ++Count; + } + + Count = 0; + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = reclassifyHvaArgType(I.type, FreeSSERegs, I.info); + ++Count; + } +} + void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { bool IsVectorCall = FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall; @@ -3769,17 +3916,24 @@ void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { } if (!getCXXABI().classifyReturnType(FI)) - FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true); + FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true, + IsVectorCall, IsRegCall); if (IsVectorCall) { // We can use up to 6 SSE register parameters with vectorcall. FreeSSERegs = 6; } else if (IsRegCall) { + // RegCall gives us 16 SSE registers, we can reuse the return registers. FreeSSERegs = 16; } - for (auto &I : FI.arguments()) - I.info = classify(I.type, FreeSSERegs, false); + if (IsVectorCall) { + computeVectorCallArgs(FI, FreeSSERegs, IsVectorCall, IsRegCall); + } else { + for (auto &I : FI.arguments()) + I.info = classify(I.type, FreeSSERegs, false, IsVectorCall, IsRegCall); + } + } Address WinX86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp index 7bd43ac9da2f..15f830d029eb 100644 --- a/lib/Driver/Driver.cpp +++ b/lib/Driver/Driver.cpp @@ -3764,6 +3764,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, case llvm::Triple::wasm64: TC = new toolchains::WebAssembly(*this, Target, Args); break; + case llvm::Triple::avr: + TC = new toolchains::AVRToolChain(*this, Target, Args); + break; default: if (Target.getVendor() == llvm::Triple::Myriad) TC = new toolchains::MyriadToolChain(*this, Target, Args); diff --git a/lib/Driver/MSVCToolChain.cpp b/lib/Driver/MSVCToolChain.cpp index 95cf056f7a74..17fd6ac6f714 100644 --- a/lib/Driver/MSVCToolChain.cpp +++ b/lib/Driver/MSVCToolChain.cpp @@ -47,9 +47,9 @@ using namespace clang::driver::toolchains; using namespace clang; using namespace llvm::opt; -MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple& Triple, +MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : ToolChain(D, Triple, Args) { + : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args) { getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); @@ -94,6 +94,15 @@ bool MSVCToolChain::isPICDefaultForced() const { return getArch() == llvm::Triple::x86_64; } +void MSVCToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); +} + +void MSVCToolChain::printVerboseInfo(raw_ostream &OS) const { + CudaInstallation.print(OS); +} + #ifdef USE_WIN32 
static bool readFullStringValue(HKEY hkey, const char *valueName, std::string &value) { diff --git a/lib/Driver/MinGWToolChain.cpp b/lib/Driver/MinGWToolChain.cpp index 938440b08f60..e971869fb569 100644 --- a/lib/Driver/MinGWToolChain.cpp +++ b/lib/Driver/MinGWToolChain.cpp @@ -20,10 +20,9 @@ using namespace clang::driver::toolchains; using namespace clang; using namespace llvm::opt; -namespace { // Simplified from Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple. -bool findGccVersion(StringRef LibDir, std::string &GccLibDir, - std::string &Ver) { +static bool findGccVersion(StringRef LibDir, std::string &GccLibDir, + std::string &Ver) { Generic_GCC::GCCVersion Version = Generic_GCC::GCCVersion::Parse("0.0.0"); std::error_code EC; for (llvm::sys::fs::directory_iterator LI(LibDir, EC), LE; !EC && LI != LE; @@ -40,7 +39,6 @@ bool findGccVersion(StringRef LibDir, std::string &GccLibDir, } return Ver.size(); } -} void MinGW::findGccLibDir() { llvm::SmallVector, 2> Archs; @@ -63,7 +61,7 @@ void MinGW::findGccLibDir() { } MinGW::MinGW(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : ToolChain(D, Triple, Args) { + : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args) { getProgramPaths().push_back(getDriver().getInstalledDir()); // In Windows there aren't any standard install locations, we search @@ -135,6 +133,15 @@ bool MinGW::UseSEHExceptions() const { return getArch() == llvm::Triple::x86_64; } +void MinGW::AddCudaIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); +} + +void MinGW::printVerboseInfo(raw_ostream &OS) const { + CudaInstallation.print(OS); +} + // Include directories for various hosts: // Windows, mingw.org diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp index 968b0cb4724a..789a2f0525be 100644 --- a/lib/Driver/ToolChains.cpp +++ b/lib/Driver/ToolChains.cpp @@ -1805,19 +1805,26 @@ static CudaVersion ParseCudaVersionFile(llvm::StringRef V) { } CudaInstallationDetector::CudaInstallationDetector( - const Driver &D, const llvm::Triple &TargetTriple, + const Driver &D, const llvm::Triple &HostTriple, const llvm::opt::ArgList &Args) : D(D) { SmallVector CudaPathCandidates; - if (Args.hasArg(options::OPT_cuda_path_EQ)) + // In decreasing order so we prefer newer versions to older versions. + std::initializer_list Versions = {"8.0", "7.5", "7.0"}; + + if (Args.hasArg(options::OPT_cuda_path_EQ)) { CudaPathCandidates.push_back( Args.getLastArgValue(options::OPT_cuda_path_EQ)); - else { + } else if (HostTriple.isOSWindows()) { + for (const char *Ver : Versions) + CudaPathCandidates.push_back( + D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" + + Ver); + } else { CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda"); - CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); - CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.5"); - CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.0"); + for (const char *Ver : Versions) + CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-" + Ver); } for (const auto &CudaPath : CudaPathCandidates) { @@ -1840,7 +1847,7 @@ CudaInstallationDetector::CudaInstallationDetector( // It's sufficient for our purposes to be flexible: If both lib and lib64 // exist, we choose whichever one matches our triple. Otherwise, if only // lib exists, we use it. 
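// [Worked example -- editor's addition.] With the candidate list above, a
// Windows host now probes, newest first,
//
//   <sysroot>/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0
//   <sysroot>/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.5
//   <sysroot>/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.0
//
// while other hosts try /usr/local/cuda and then /usr/local/cuda-8.0, -7.5,
// -7.0. The first candidate with a plausible layout wins, and the lib-vs-lib64
// choice below is now keyed to the host triple rather than the device triple,
// presumably because those directories follow the host's library layout.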
- if (TargetTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64")) + if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64")) LibPath = InstallPath + "/lib64"; else if (FS.exists(InstallPath + "/lib")) LibPath = InstallPath + "/lib"; @@ -4870,7 +4877,7 @@ Tool *DragonFly::buildLinker() const { CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, const ToolChain &HostTC, const ArgList &Args) : ToolChain(D, Triple, Args), HostTC(HostTC), - CudaInstallation(D, Triple, Args) { + CudaInstallation(D, HostTC.getTriple(), Args) { if (CudaInstallation.isValid()) getProgramPaths().push_back(CudaInstallation.getBinPath()); } @@ -5021,6 +5028,11 @@ SanitizerMask CudaToolChain::getSupportedSanitizers() const { return HostTC.getSupportedSanitizers(); } +VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D, + const ArgList &Args) const { + return HostTC.computeMSVCVersion(D, Args); +} + /// XCore tool chain XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) @@ -5318,3 +5330,12 @@ SanitizerMask Contiki::getSupportedSanitizers() const { Res |= SanitizerKind::SafeStack; return Res; } + +/// AVR Toolchain +AVRToolChain::AVRToolChain(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : Generic_ELF(D, Triple, Args) { } +Tool *AVRToolChain::buildLinker() const { + return new tools::AVR::Linker(*this); +} +// End AVR diff --git a/lib/Driver/ToolChains.h b/lib/Driver/ToolChains.h index 7dab08915d48..3240357ba6b1 100644 --- a/lib/Driver/ToolChains.h +++ b/lib/Driver/ToolChains.h @@ -43,7 +43,7 @@ private: mutable llvm::SmallSet ArchsWithVersionTooLowErrors; public: - CudaInstallationDetector(const Driver &D, const llvm::Triple &Triple, + CudaInstallationDetector(const Driver &D, const llvm::Triple &HostTriple, const llvm::opt::ArgList &Args); void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, @@ -709,12 +709,19 @@ public: const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + + void printVerboseInfo(raw_ostream &OS) const override; + protected: Tool *getTool(Action::ActionClass AC) const override; Tool *buildLinker() const override; Tool *buildAssembler() const override; private: + CudaInstallationDetector CudaInstallation; + std::string Base; std::string GccLibDir; std::string Ver; @@ -892,6 +899,10 @@ public: CudaToolChain(const Driver &D, const llvm::Triple &Triple, const ToolChain &HostTC, const llvm::opt::ArgList &Args); + virtual const llvm::Triple *getAuxTriple() const override { + return &HostTC.getTriple(); + } + llvm::opt::DerivedArgList * TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch, Action::OffloadKind DeviceOffloadKind) const override; @@ -924,6 +935,10 @@ public: SanitizerMask getSupportedSanitizers() const override; + VersionTuple + computeMSVCVersion(const Driver *D, + const llvm::opt::ArgList &Args) const override; + const ToolChain &HostTC; CudaInstallationDetector CudaInstallation; @@ -1147,6 +1162,9 @@ public: const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + bool getWindowsSDKDir(std::string &path, int &major, std::string &windowsSDKIncludeVersion, std::string &windowsSDKLibVersion) const; @@ -1166,6 +1184,8 @@ public: types::ID InputType) 
const override; SanitizerMask getSupportedSanitizers() const override; + void printVerboseInfo(raw_ostream &OS) const override; + protected: void AddSystemIncludeWithSubfolder(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, @@ -1179,6 +1199,8 @@ protected: private: VersionTuple getMSVCVersionFromTriple() const; VersionTuple getMSVCVersionFromExe() const; + + CudaInstallationDetector CudaInstallation; }; class LLVM_LIBRARY_VISIBILITY CrossWindowsToolChain : public Generic_GCC { @@ -1349,6 +1371,16 @@ public: SanitizerMask getSupportedSanitizers() const override; }; +class LLVM_LIBRARY_VISIBILITY AVRToolChain : public Generic_ELF { +protected: + Tool *buildLinker() const override; +public: + AVRToolChain(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args); + bool IsIntegratedAssemblerDefault() const override { return true; } +}; + + } // end namespace toolchains } // end namespace driver } // end namespace clang diff --git a/lib/Driver/Tools.cpp b/lib/Driver/Tools.cpp index ea5ad7d051b6..8e02d45fcc4a 100644 --- a/lib/Driver/Tools.cpp +++ b/lib/Driver/Tools.cpp @@ -4086,13 +4086,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; - bool IsWindowsGNU = getToolChain().getTriple().isWindowsGNUEnvironment(); - bool IsWindowsCygnus = - getToolChain().getTriple().isWindowsCygwinEnvironment(); - bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment(); - bool IsPS4CPU = getToolChain().getTriple().isPS4CPU(); - bool IsIAMCU = getToolChain().getTriple().isOSIAMCU(); - // Check number of inputs for sanity. We need at least one input. assert(Inputs.size() >= 1 && "Must have at least one input."); const InputInfo &Input = Inputs[0]; @@ -4106,6 +4099,23 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Inputs.size() == 1) && "Unable to handle multiple inputs."); + bool IsWindowsGNU = getToolChain().getTriple().isWindowsGNUEnvironment(); + bool IsWindowsCygnus = + getToolChain().getTriple().isWindowsCygwinEnvironment(); + bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment(); + bool IsPS4CPU = getToolChain().getTriple().isPS4CPU(); + bool IsIAMCU = getToolChain().getTriple().isOSIAMCU(); + + // Adjust IsWindowsXYZ for CUDA compilations. Even when compiling in device + // mode (i.e., getToolchain().getTriple() is NVPTX, not Windows), we need to + // pass Windows-specific flags to cc1. + if (IsCuda) { + const llvm::Triple *AuxTriple = getToolChain().getAuxTriple(); + IsWindowsMSVC |= AuxTriple && AuxTriple->isWindowsMSVCEnvironment(); + IsWindowsGNU |= AuxTriple && AuxTriple->isWindowsGNUEnvironment(); + IsWindowsCygnus |= AuxTriple && AuxTriple->isWindowsCygwinEnvironment(); + } + // C++ is not supported for IAMCU. 
if (IsIAMCU && types::isCXX(Input.getType())) D.Diag(diag::err_drv_clang_unsupported) << "C++ for IAMCU"; @@ -12191,3 +12201,19 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); } + +void AVR::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + + std::string Linker = getToolChain().GetProgramPath(getShortName()); + ArgStringList CmdArgs; + AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + C.addCommand(llvm::make_unique(JA, *this, Args.MakeArgString(Linker), + CmdArgs, Inputs)); +} +// AVR tools end. diff --git a/lib/Driver/Tools.h b/lib/Driver/Tools.h index 98dcf841169e..9d5b892d424c 100644 --- a/lib/Driver/Tools.h +++ b/lib/Driver/Tools.h @@ -990,6 +990,19 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool { } // end namespace NVPTX +namespace AVR { +class LLVM_LIBRARY_VISIBILITY Linker : public GnuTool { +public: + Linker(const ToolChain &TC) : GnuTool("AVR::Linker", "avr-ld", TC) {} + bool hasIntegratedCPP() const override { return false; } + bool isLinkJob() const override { return true; } + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; +} // end namespace AVR + } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/lib/Frontend/ASTUnit.cpp b/lib/Frontend/ASTUnit.cpp index 32ce966f798e..d8929969e6c1 100644 --- a/lib/Frontend/ASTUnit.cpp +++ b/lib/Frontend/ASTUnit.cpp @@ -245,7 +245,7 @@ ASTUnit::~ASTUnit() { // perform this operation here because we explicitly request that the // compiler instance *not* free these buffers for each invocation of the // parser. - if (Invocation.get() && OwnsRemappedFileBuffers) { + if (Invocation && OwnsRemappedFileBuffers) { PreprocessorOptions &PPOpts = Invocation->getPreprocessorOpts(); for (const auto &RB : PPOpts.RemappedFileBuffers) delete RB.second; @@ -257,7 +257,9 @@ ASTUnit::~ASTUnit() { fprintf(stderr, "--- %u translation units\n", --ActiveASTUnitObjects); } -void ASTUnit::setPreprocessor(Preprocessor *pp) { PP = pp; } +void ASTUnit::setPreprocessor(std::shared_ptr PP) { + this->PP = std::move(PP); +} /// \brief Determine the set of code-completion contexts in which this /// declaration should be shown. @@ -346,7 +348,7 @@ void ASTUnit::CacheCodeCompletionResults() { // Gather the set of global code completions. 
typedef CodeCompletionResult Result; SmallVector Results; - CachedCompletionAllocator = new GlobalCodeCompletionAllocator; + CachedCompletionAllocator = std::make_shared(); CodeCompletionTUInfo CCTUInfo(CachedCompletionAllocator); TheSema->GatherGlobalCodeCompletions(*CachedCompletionAllocator, CCTUInfo, Results); @@ -675,7 +677,7 @@ std::unique_ptr ASTUnit::LoadFromASTFile( AST->SourceMgr = new SourceManager(AST->getDiagnostics(), AST->getFileManager(), UserFilesAreVolatile); - AST->HSOpts = new HeaderSearchOptions(); + AST->HSOpts = std::make_shared(); AST->HSOpts->ModuleFormat = PCHContainerRdr.getFormat(); AST->HeaderInfo.reset(new HeaderSearch(AST->HSOpts, AST->getSourceManager(), @@ -683,7 +685,7 @@ std::unique_ptr ASTUnit::LoadFromASTFile( AST->ASTFileLangOpts, /*Target=*/nullptr)); - PreprocessorOptions *PPOpts = new PreprocessorOptions(); + auto PPOpts = std::make_shared(); for (const auto &RemappedFile : RemappedFiles) PPOpts->addRemappedFile(RemappedFile.first, RemappedFile.second); @@ -693,11 +695,11 @@ std::unique_ptr ASTUnit::LoadFromASTFile( HeaderSearch &HeaderInfo = *AST->HeaderInfo; unsigned Counter; - AST->PP = - new Preprocessor(PPOpts, AST->getDiagnostics(), AST->ASTFileLangOpts, - AST->getSourceManager(), HeaderInfo, *AST, - /*IILookup=*/nullptr, - /*OwnsHeaderSearch=*/false); + AST->PP = std::make_shared( + std::move(PPOpts), AST->getDiagnostics(), AST->ASTFileLangOpts, + AST->getSourceManager(), HeaderInfo, *AST, + /*IILookup=*/nullptr, + /*OwnsHeaderSearch=*/false); Preprocessor &PP = *AST->PP; AST->Ctx = new ASTContext(AST->ASTFileLangOpts, AST->getSourceManager(), @@ -926,7 +928,7 @@ public: const Preprocessor &PP, StringRef isysroot, std::unique_ptr Out) : PCHGenerator(PP, "", isysroot, std::make_shared(), - ArrayRef>(), + ArrayRef>(), /*AllowASTWithErrors=*/true), Unit(Unit), Hash(Unit.getCurrentTopLevelHashValue()), Action(Action), Out(std::move(Out)) { @@ -1046,10 +1048,7 @@ bool ASTUnit::Parse(std::shared_ptr PCHContainerOps, llvm::CrashRecoveryContextCleanupRegistrar CICleanup(Clang.get()); - IntrusiveRefCntPtr - CCInvocation(new CompilerInvocation(*Invocation)); - - Clang->setInvocation(CCInvocation.get()); + Clang->setInvocation(std::make_shared(*Invocation)); OriginalSourceFile = Clang->getFrontendOpts().Inputs[0].getFile(); // Set up diagnostics, capturing any diagnostics that would @@ -1342,8 +1341,8 @@ ASTUnit::getMainBufferWithPrecompiledPreamble( const CompilerInvocation &PreambleInvocationIn, bool AllowRebuild, unsigned MaxLines) { - IntrusiveRefCntPtr - PreambleInvocation(new CompilerInvocation(PreambleInvocationIn)); + auto PreambleInvocation = + std::make_shared(PreambleInvocationIn); FrontendOptions &FrontendOpts = PreambleInvocation->getFrontendOpts(); PreprocessorOptions &PreprocessorOpts = PreambleInvocation->getPreprocessorOpts(); @@ -1521,7 +1520,7 @@ ASTUnit::getMainBufferWithPrecompiledPreamble( llvm::CrashRecoveryContextCleanupRegistrar CICleanup(Clang.get()); - Clang->setInvocation(&*PreambleInvocation); + Clang->setInvocation(std::move(PreambleInvocation)); OriginalSourceFile = Clang->getFrontendOpts().Inputs[0].getFile(); // Set up diagnostics, capturing all of the diagnostics produced. 
@@ -1671,7 +1670,7 @@ void ASTUnit::transferASTDataFromCompilerInstance(CompilerInstance &CI) { if (CI.hasASTContext()) Ctx = &CI.getASTContext(); if (CI.hasPreprocessor()) - PP = &CI.getPreprocessor(); + PP = CI.getPreprocessorPtr(); CI.setSourceManager(nullptr); CI.setFileManager(nullptr); if (CI.hasTarget()) @@ -1707,30 +1706,29 @@ StringRef ASTUnit::getASTFileName() const { return Mod.FileName; } -ASTUnit *ASTUnit::create(CompilerInvocation *CI, - IntrusiveRefCntPtr Diags, - bool CaptureDiagnostics, - bool UserFilesAreVolatile) { - std::unique_ptr AST; - AST.reset(new ASTUnit(false)); +std::unique_ptr +ASTUnit::create(std::shared_ptr CI, + IntrusiveRefCntPtr Diags, + bool CaptureDiagnostics, bool UserFilesAreVolatile) { + std::unique_ptr AST(new ASTUnit(false)); ConfigureDiags(Diags, *AST, CaptureDiagnostics); - AST->Diagnostics = Diags; - AST->Invocation = CI; - AST->FileSystemOpts = CI->getFileSystemOpts(); IntrusiveRefCntPtr VFS = createVFSFromCompilerInvocation(*CI, *Diags); if (!VFS) return nullptr; + AST->Diagnostics = Diags; + AST->FileSystemOpts = CI->getFileSystemOpts(); + AST->Invocation = std::move(CI); AST->FileMgr = new FileManager(AST->FileSystemOpts, VFS); AST->UserFilesAreVolatile = UserFilesAreVolatile; AST->SourceMgr = new SourceManager(AST->getDiagnostics(), *AST->FileMgr, UserFilesAreVolatile); - return AST.release(); + return AST; } ASTUnit *ASTUnit::LoadFromCompilerInvocationAction( - CompilerInvocation *CI, + std::shared_ptr CI, std::shared_ptr PCHContainerOps, IntrusiveRefCntPtr Diags, FrontendAction *Action, ASTUnit *Unit, bool Persistent, StringRef ResourceFilesPath, @@ -1744,7 +1742,7 @@ ASTUnit *ASTUnit::LoadFromCompilerInvocationAction( ASTUnit *AST = Unit; if (!AST) { // Create the AST unit. - OwnAST.reset(create(CI, Diags, CaptureDiagnostics, UserFilesAreVolatile)); + OwnAST = create(CI, Diags, CaptureDiagnostics, UserFilesAreVolatile); AST = OwnAST.get(); if (!AST) return nullptr; @@ -1783,7 +1781,7 @@ ASTUnit *ASTUnit::LoadFromCompilerInvocationAction( llvm::CrashRecoveryContextCleanupRegistrar CICleanup(Clang.get()); - Clang->setInvocation(CI); + Clang->setInvocation(std::move(CI)); AST->OriginalSourceFile = Clang->getFrontendOpts().Inputs[0].getFile(); // Set up diagnostics, capturing any diagnostics that would @@ -1901,7 +1899,7 @@ bool ASTUnit::LoadFromCompilerInvocation( } std::unique_ptr ASTUnit::LoadFromCompilerInvocation( - CompilerInvocation *CI, + std::shared_ptr CI, std::shared_ptr PCHContainerOps, IntrusiveRefCntPtr Diags, FileManager *FileMgr, bool OnlyLocalDecls, bool CaptureDiagnostics, @@ -1918,7 +1916,7 @@ std::unique_ptr ASTUnit::LoadFromCompilerInvocation( AST->ShouldCacheCodeCompletionResults = CacheCodeCompletionResults; AST->IncludeBriefCommentsInCodeCompletion = IncludeBriefCommentsInCodeCompletion; - AST->Invocation = CI; + AST->Invocation = std::move(CI); AST->FileSystemOpts = FileMgr->getFileSystemOpts(); AST->FileMgr = FileMgr; AST->UserFilesAreVolatile = UserFilesAreVolatile; @@ -1950,8 +1948,8 @@ ASTUnit *ASTUnit::LoadFromCommandLine( assert(Diags.get() && "no DiagnosticsEngine was provided"); SmallVector StoredDiagnostics; - - IntrusiveRefCntPtr CI; + + std::shared_ptr CI; { @@ -1959,8 +1957,7 @@ ASTUnit *ASTUnit::LoadFromCommandLine( StoredDiagnostics); CI = clang::createInvocationFromCommandLine( - llvm::makeArrayRef(ArgBegin, ArgEnd), - Diags); + llvm::makeArrayRef(ArgBegin, ArgEnd), Diags); if (!CI) return nullptr; } @@ -2331,8 +2328,7 @@ void ASTUnit::CodeComplete( CompletionTimer.setOutput("Code completion @ " + File 
+ ":" + Twine(Line) + ":" + Twine(Column)); - IntrusiveRefCntPtr - CCInvocation(new CompilerInvocation(*Invocation)); + auto CCInvocation = std::make_shared(*Invocation); FrontendOptions &FrontendOpts = CCInvocation->getFrontendOpts(); CodeCompleteOptions &CodeCompleteOpts = FrontendOpts.CodeCompleteOpts; @@ -2364,7 +2360,8 @@ void ASTUnit::CodeComplete( llvm::CrashRecoveryContextCleanupRegistrar CICleanup(Clang.get()); - Clang->setInvocation(&*CCInvocation); + auto &Inv = *CCInvocation; + Clang->setInvocation(std::move(CCInvocation)); OriginalSourceFile = Clang->getFrontendOpts().Inputs[0].getFile(); // Set up diagnostics, capturing any diagnostics produced. @@ -2372,8 +2369,8 @@ void ASTUnit::CodeComplete( CaptureDroppedDiagnostics Capture(true, Clang->getDiagnostics(), StoredDiagnostics); - ProcessWarningOptions(Diag, CCInvocation->getDiagnosticOpts()); - + ProcessWarningOptions(Diag, Inv.getDiagnosticOpts()); + // Create the target instance. Clang->setTarget(TargetInfo::CreateTargetInfo( Clang->getDiagnostics(), Clang->getInvocation().TargetOpts)); @@ -2429,7 +2426,7 @@ void ASTUnit::CodeComplete( if (!llvm::sys::fs::getUniqueID(MainPath, MainID)) { if (CompleteFileID == MainID && Line > 1) OverrideMainBuffer = getMainBufferWithPrecompiledPreamble( - PCHContainerOps, *CCInvocation, false, Line - 1); + PCHContainerOps, Inv, false, Line - 1); } } } diff --git a/lib/Frontend/ChainedIncludesSource.cpp b/lib/Frontend/ChainedIncludesSource.cpp index c5b77ee90e56..b984c2ed0dd5 100644 --- a/lib/Frontend/ChainedIncludesSource.cpp +++ b/lib/Frontend/ChainedIncludesSource.cpp @@ -147,7 +147,7 @@ IntrusiveRefCntPtr clang::createChainedIncludesSource( std::unique_ptr Clang( new CompilerInstance(CI.getPCHContainerOperations())); - Clang->setInvocation(CInvok.release()); + Clang->setInvocation(std::move(CInvok)); Clang->setDiagnostics(Diags.get()); Clang->setTarget(TargetInfo::CreateTargetInfo( Clang->getDiagnostics(), Clang->getInvocation().TargetOpts)); @@ -159,7 +159,7 @@ IntrusiveRefCntPtr clang::createChainedIncludesSource( Clang->createASTContext(); auto Buffer = std::make_shared(); - ArrayRef> Extensions; + ArrayRef> Extensions; auto consumer = llvm::make_unique( Clang->getPreprocessor(), "-", /*isysroot=*/"", Buffer, Extensions, /*AllowASTWithErrors=*/true); diff --git a/lib/Frontend/CompilerInstance.cpp b/lib/Frontend/CompilerInstance.cpp index ccddd14f0f34..afcaa6e87878 100644 --- a/lib/Frontend/CompilerInstance.cpp +++ b/lib/Frontend/CompilerInstance.cpp @@ -66,8 +66,9 @@ CompilerInstance::~CompilerInstance() { assert(OutputFiles.empty() && "Still output files in flight?"); } -void CompilerInstance::setInvocation(CompilerInvocation *Value) { - Invocation = Value; +void CompilerInstance::setInvocation( + std::shared_ptr Value) { + Invocation = std::move(Value); } bool CompilerInstance::shouldBuildGlobalModuleIndex() const { @@ -96,7 +97,9 @@ void CompilerInstance::setSourceManager(SourceManager *Value) { SourceMgr = Value; } -void CompilerInstance::setPreprocessor(Preprocessor *Value) { PP = Value; } +void CompilerInstance::setPreprocessor(std::shared_ptr Value) { + PP = std::move(Value); +} void CompilerInstance::setASTContext(ASTContext *Value) { Context = Value; @@ -365,14 +368,13 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { PTHMgr = PTHManager::Create(PPOpts.TokenCache, getDiagnostics()); // Create the Preprocessor. 
- HeaderSearch *HeaderInfo = new HeaderSearch(&getHeaderSearchOpts(), - getSourceManager(), - getDiagnostics(), - getLangOpts(), - &getTarget()); - PP = new Preprocessor(&getPreprocessorOpts(), getDiagnostics(), getLangOpts(), - getSourceManager(), *HeaderInfo, *this, PTHMgr, - /*OwnsHeaderSearch=*/true, TUKind); + HeaderSearch *HeaderInfo = + new HeaderSearch(getHeaderSearchOptsPtr(), getSourceManager(), + getDiagnostics(), getLangOpts(), &getTarget()); + PP = std::make_shared( + Invocation->getPreprocessorOptsPtr(), getDiagnostics(), getLangOpts(), + getSourceManager(), *HeaderInfo, *this, PTHMgr, + /*OwnsHeaderSearch=*/true, TUKind); PP->Initialize(getTarget(), getAuxTarget()); // Note that this is different then passing PTHMgr to Preprocessor's ctor. @@ -498,7 +500,7 @@ IntrusiveRefCntPtr CompilerInstance::createPCHExternalASTSource( StringRef Path, StringRef Sysroot, bool DisablePCHValidation, bool AllowPCHWithCompilerErrors, Preprocessor &PP, ASTContext &Context, const PCHContainerReader &PCHContainerRdr, - ArrayRef> Extensions, + ArrayRef> Extensions, void *DeserializationListener, bool OwnDeserializationListener, bool Preamble, bool UseGlobalModuleIndex) { HeaderSearchOptions &HSOpts = PP.getHeaderSearchInfo().getHeaderSearchOpts(); @@ -1018,8 +1020,8 @@ static bool compileModuleImpl(CompilerInstance &ImportingInstance, = ImportingInstance.getPreprocessor().getHeaderSearchInfo().getModuleMap(); // Construct a compiler invocation for creating this module. - IntrusiveRefCntPtr Invocation - (new CompilerInvocation(ImportingInstance.getInvocation())); + auto Invocation = + std::make_shared(ImportingInstance.getInvocation()); PreprocessorOptions &PPOpts = Invocation->getPreprocessorOpts(); @@ -1049,7 +1051,8 @@ static bool compileModuleImpl(CompilerInstance &ImportingInstance, PreprocessorOptions &ImportingPPOpts = ImportingInstance.getInvocation().getPreprocessorOpts(); if (!ImportingPPOpts.FailedModules) - ImportingPPOpts.FailedModules = new PreprocessorOptions::FailedModulesSet; + ImportingPPOpts.FailedModules = + std::make_shared(); PPOpts.FailedModules = ImportingPPOpts.FailedModules; // If there is a module map file, build the module using the module map. @@ -1074,7 +1077,8 @@ static bool compileModuleImpl(CompilerInstance &ImportingInstance, // module. CompilerInstance Instance(ImportingInstance.getPCHContainerOperations(), /*BuildingModule=*/true); - Instance.setInvocation(&*Invocation); + auto &Inv = *Invocation; + Instance.setInvocation(std::move(Invocation)); Instance.createDiagnostics(new ForwardingDiagnosticConsumer( ImportingInstance.getDiagnosticClient()), @@ -1096,7 +1100,7 @@ static bool compileModuleImpl(CompilerInstance &ImportingInstance, // between all of the module CompilerInstances. Other than that, we don't // want to produce any dependency output from the module build. Instance.setModuleDepCollector(ImportingInstance.getModuleDepCollector()); - Invocation->getDependencyOutputOpts() = DependencyOutputOptions(); + Inv.getDependencyOutputOpts() = DependencyOutputOptions(); // Get or create the module map that we'll use to build this module. 
std::string InferredModuleMapContent; diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index ca4a7655a37d..93bbcc42da1a 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -60,12 +60,11 @@ CompilerInvocationBase::CompilerInvocationBase() PreprocessorOpts(new PreprocessorOptions()) {} CompilerInvocationBase::CompilerInvocationBase(const CompilerInvocationBase &X) - : RefCountedBase(), - LangOpts(new LangOptions(*X.getLangOpts())), - TargetOpts(new TargetOptions(X.getTargetOpts())), - DiagnosticOpts(new DiagnosticOptions(X.getDiagnosticOpts())), - HeaderSearchOpts(new HeaderSearchOptions(X.getHeaderSearchOpts())), - PreprocessorOpts(new PreprocessorOptions(X.getPreprocessorOpts())) {} + : LangOpts(new LangOptions(*X.getLangOpts())), + TargetOpts(new TargetOptions(X.getTargetOpts())), + DiagnosticOpts(new DiagnosticOptions(X.getDiagnosticOpts())), + HeaderSearchOpts(new HeaderSearchOptions(X.getHeaderSearchOpts())), + PreprocessorOpts(new PreprocessorOptions(X.getPreprocessorOpts())) {} CompilerInvocationBase::~CompilerInvocationBase() {} @@ -1214,8 +1213,8 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, // Add the testing module file extension. Opts.ModuleFileExtensions.push_back( - new TestModuleFileExtension(BlockName, MajorVersion, MinorVersion, - Hashed, UserInfo)); + std::make_shared( + BlockName, MajorVersion, MinorVersion, Hashed, UserInfo)); } if (const Arg *A = Args.getLastArg(OPT_code_completion_at)) { diff --git a/lib/Frontend/CreateInvocationFromCommandLine.cpp b/lib/Frontend/CreateInvocationFromCommandLine.cpp index 1e9e57afb6bd..16269064b6e1 100644 --- a/lib/Frontend/CreateInvocationFromCommandLine.cpp +++ b/lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -30,9 +30,9 @@ using namespace llvm::opt; /// /// \return A CompilerInvocation, or 0 if none was built for the given /// argument vector. -CompilerInvocation * -clang::createInvocationFromCommandLine(ArrayRef ArgList, - IntrusiveRefCntPtr Diags) { +std::unique_ptr clang::createInvocationFromCommandLine( + ArrayRef ArgList, + IntrusiveRefCntPtr Diags) { if (!Diags.get()) { // No diagnostics engine was provided, so create our own diagnostics object // with the default options. @@ -93,12 +93,12 @@ clang::createInvocationFromCommandLine(ArrayRef ArgList, } const ArgStringList &CCArgs = Cmd.getArguments(); - std::unique_ptr CI(new CompilerInvocation()); + auto CI = llvm::make_unique(); if (!CompilerInvocation::CreateFromArgs(*CI, const_cast(CCArgs.data()), const_cast(CCArgs.data()) + CCArgs.size(), *Diags)) return nullptr; - return CI.release(); + return CI; } diff --git a/lib/Frontend/FrontendAction.cpp b/lib/Frontend/FrontendAction.cpp index e871b310302d..39fc1371a9ef 100644 --- a/lib/Frontend/FrontendAction.cpp +++ b/lib/Frontend/FrontendAction.cpp @@ -224,7 +224,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // file, otherwise the CompilerInstance will happily destroy them. 
CI.setFileManager(&AST->getFileManager()); CI.setSourceManager(&AST->getSourceManager()); - CI.setPreprocessor(&AST->getPreprocessor()); + CI.setPreprocessor(AST->getPreprocessorPtr()); CI.setASTContext(&AST->getASTContext()); setCurrentInput(Input, std::move(AST)); diff --git a/lib/Frontend/SerializedDiagnosticPrinter.cpp b/lib/Frontend/SerializedDiagnosticPrinter.cpp index 1ea5a342e1d8..7f88c919e24a 100644 --- a/lib/Frontend/SerializedDiagnosticPrinter.cpp +++ b/lib/Frontend/SerializedDiagnosticPrinter.cpp @@ -143,7 +143,7 @@ class SDiagsWriter : public DiagnosticConsumer { struct SharedState; - explicit SDiagsWriter(IntrusiveRefCntPtr State) + explicit SDiagsWriter(std::shared_ptr State) : LangOpts(nullptr), OriginalInstance(false), MergeChildRecords(false), State(std::move(State)) {} @@ -151,7 +151,7 @@ public: SDiagsWriter(StringRef File, DiagnosticOptions *Diags, bool MergeChildRecords) : LangOpts(nullptr), OriginalInstance(true), MergeChildRecords(MergeChildRecords), - State(new SharedState(File, Diags)) { + State(std::make_shared(File, Diags)) { if (MergeChildRecords) RemoveOldDiagnostics(); EmitPreamble(); @@ -251,7 +251,7 @@ private: /// \brief State that is shared among the various clones of this diagnostic /// consumer. - struct SharedState : RefCountedBase { + struct SharedState { SharedState(StringRef File, DiagnosticOptions *Diags) : DiagOpts(Diags), Stream(Buffer), OutputFile(File.str()), EmittedAnyDiagBlocks(false) {} @@ -299,7 +299,7 @@ private: }; /// \brief State shared among the various clones of this diagnostic consumer. - IntrusiveRefCntPtr State; + std::shared_ptr State; }; } // end anonymous namespace @@ -422,15 +422,15 @@ void SDiagsWriter::EmitPreamble() { EmitMetaBlock(); } -static void AddSourceLocationAbbrev(llvm::BitCodeAbbrev *Abbrev) { +static void AddSourceLocationAbbrev(llvm::BitCodeAbbrev &Abbrev) { using namespace llvm; - Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // File ID. - Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Line. - Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Column. - Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Offset; + Abbrev.Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // File ID. + Abbrev.Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Line. + Abbrev.Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Column. + Abbrev.Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Offset; } -static void AddRangeLocationAbbrev(llvm::BitCodeAbbrev *Abbrev) { +static void AddRangeLocationAbbrev(llvm::BitCodeAbbrev &Abbrev) { AddSourceLocationAbbrev(Abbrev); AddSourceLocationAbbrev(Abbrev); } @@ -449,7 +449,7 @@ void SDiagsWriter::EmitBlockInfoBlock() { EmitBlockID(BLOCK_META, "Meta", Stream, Record); EmitRecordID(RECORD_VERSION, "Version", Stream, Record); - BitCodeAbbrev *Abbrev = new BitCodeAbbrev(); + auto Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_VERSION)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbrevs.set(RECORD_VERSION, Stream.EmitBlockInfoAbbrev(BLOCK_META, Abbrev)); @@ -467,10 +467,10 @@ void SDiagsWriter::EmitBlockInfoBlock() { EmitRecordID(RECORD_FIXIT, "FixIt", Stream, Record); // Emit abbreviation for RECORD_DIAG. - Abbrev = new BitCodeAbbrev(); + Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_DIAG)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Diag level. 
- AddSourceLocationAbbrev(Abbrev); + AddSourceLocationAbbrev(*Abbrev); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // Category. Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // Mapped Diag ID. Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // Text size. @@ -478,7 +478,7 @@ void SDiagsWriter::EmitBlockInfoBlock() { Abbrevs.set(RECORD_DIAG, Stream.EmitBlockInfoAbbrev(BLOCK_DIAG, Abbrev)); // Emit abbrevation for RECORD_CATEGORY. - Abbrev = new BitCodeAbbrev(); + Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_CATEGORY)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Category ID. Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // Text size. @@ -486,14 +486,14 @@ void SDiagsWriter::EmitBlockInfoBlock() { Abbrevs.set(RECORD_CATEGORY, Stream.EmitBlockInfoAbbrev(BLOCK_DIAG, Abbrev)); // Emit abbrevation for RECORD_SOURCE_RANGE. - Abbrev = new BitCodeAbbrev(); + Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_SOURCE_RANGE)); - AddRangeLocationAbbrev(Abbrev); + AddRangeLocationAbbrev(*Abbrev); Abbrevs.set(RECORD_SOURCE_RANGE, Stream.EmitBlockInfoAbbrev(BLOCK_DIAG, Abbrev)); // Emit the abbreviation for RECORD_DIAG_FLAG. - Abbrev = new BitCodeAbbrev(); + Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_DIAG_FLAG)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // Mapped Diag ID. Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Text size. @@ -502,7 +502,7 @@ void SDiagsWriter::EmitBlockInfoBlock() { Abbrev)); // Emit the abbreviation for RECORD_FILENAME. - Abbrev = new BitCodeAbbrev(); + Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_FILENAME)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // Mapped file ID. Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Size. @@ -513,9 +513,9 @@ void SDiagsWriter::EmitBlockInfoBlock() { Abbrev)); // Emit the abbreviation for RECORD_FIXIT. - Abbrev = new BitCodeAbbrev(); + Abbrev = std::make_shared(); Abbrev->Add(BitCodeAbbrevOp(RECORD_FIXIT)); - AddRangeLocationAbbrev(Abbrev); + AddRangeLocationAbbrev(*Abbrev); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Text size. Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // FixIt text. Abbrevs.set(RECORD_FIXIT, Stream.EmitBlockInfoAbbrev(BLOCK_DIAG, diff --git a/lib/Frontend/TestModuleFileExtension.cpp b/lib/Frontend/TestModuleFileExtension.cpp index b43d45f7ae46..294f7e44cee5 100644 --- a/lib/Frontend/TestModuleFileExtension.cpp +++ b/lib/Frontend/TestModuleFileExtension.cpp @@ -24,11 +24,11 @@ void TestModuleFileExtension::Writer::writeExtensionContents( using namespace llvm; // Write an abbreviation for this record. - BitCodeAbbrev *Abv = new llvm::BitCodeAbbrev(); + auto Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(FIRST_EXTENSION_RECORD_ID)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of characters Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // message - auto Abbrev = Stream.EmitAbbrev(Abv); + auto Abbrev = Stream.EmitAbbrev(std::move(Abv)); // Write a message into the extension block. 
SmallString<64> Message; diff --git a/lib/Headers/__clang_cuda_cmath.h b/lib/Headers/__clang_cuda_cmath.h index 0eaa08b30cab..9bef82611aa4 100644 --- a/lib/Headers/__clang_cuda_cmath.h +++ b/lib/Headers/__clang_cuda_cmath.h @@ -72,6 +72,10 @@ __DEVICE__ int fpclassify(double __x) { __DEVICE__ float frexp(float __arg, int *__exp) { return ::frexpf(__arg, __exp); } + +// For inscrutable reasons, the CUDA headers define these functions for us on +// Windows. +#ifndef _MSC_VER __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } @@ -79,6 +83,10 @@ __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } // __finitef, does not exist when compiling for MacOS. __isfinited is available // everywhere and is just as good. __DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); } +__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } +#endif + __DEVICE__ bool isgreater(float __x, float __y) { return __builtin_isgreater(__x, __y); } @@ -109,8 +117,6 @@ __DEVICE__ bool islessgreater(float __x, float __y) { __DEVICE__ bool islessgreater(double __x, double __y) { return __builtin_islessgreater(__x, __y); } -__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } -__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } __DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); } __DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); } __DEVICE__ bool isunordered(float __x, float __y) { diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h index 3df41fa290d3..b43ce21d0bb3 100644 --- a/lib/Headers/__clang_cuda_intrinsics.h +++ b/lib/Headers/__clang_cuda_intrinsics.h @@ -35,50 +35,50 @@ #pragma push_macro("__MAKE_SHUFFLES") #define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \ - inline __device__ int __FnName(int __in, int __offset, \ + inline __device__ int __FnName(int __val, int __offset, \ int __width = warpSize) { \ - return __IntIntrinsic(__in, __offset, \ + return __IntIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ float __FnName(float __in, int __offset, \ + inline __device__ float __FnName(float __val, int __offset, \ int __width = warpSize) { \ - return __FloatIntrinsic(__in, __offset, \ + return __FloatIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ unsigned int __FnName(unsigned int __in, int __offset, \ + inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \ int __width = warpSize) { \ return static_cast( \ - ::__FnName(static_cast(__in), __offset, __width)); \ + ::__FnName(static_cast(__val), __offset, __width)); \ } \ - inline __device__ long long __FnName(long long __in, int __offset, \ + inline __device__ long long __FnName(long long __val, int __offset, \ int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ }; \ - _Static_assert(sizeof(__in) == sizeof(__Bits)); \ + _Static_assert(sizeof(__val) == sizeof(__Bits)); \ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ __Bits __tmp; \ - memcpy(&__in, &__tmp, sizeof(__in)); \ + memcpy(&__val, &__tmp, sizeof(__val)); \ __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \ __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \ - long long __out; \ - memcpy(&__out, &__tmp, sizeof(__tmp)); \ - return 
__out; \ + long long __ret; \ + memcpy(&__ret, &__tmp, sizeof(__tmp)); \ + return __ret; \ } \ inline __device__ unsigned long long __FnName( \ - unsigned long long __in, int __offset, int __width = warpSize) { \ - return static_cast( \ - ::__FnName(static_cast(__in), __offset, __width)); \ + unsigned long long __val, int __offset, int __width = warpSize) { \ + return static_cast(::__FnName( \ + static_cast(__val), __offset, __width)); \ } \ - inline __device__ double __FnName(double __in, int __offset, \ + inline __device__ double __FnName(double __val, int __offset, \ int __width = warpSize) { \ long long __tmp; \ - _Static_assert(sizeof(__tmp) == sizeof(__in)); \ - memcpy(&__tmp, &__in, sizeof(__in)); \ + _Static_assert(sizeof(__tmp) == sizeof(__val)); \ + memcpy(&__tmp, &__val, sizeof(__val)); \ __tmp = ::__FnName(__tmp, __offset, __width); \ - double __out; \ - memcpy(&__out, &__tmp, sizeof(__out)); \ - return __out; \ + double __ret; \ + memcpy(&__ret, &__tmp, sizeof(__ret)); \ + return __ret; \ } __MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f); diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h index d1d1d8026325..a8618816d5bb 100644 --- a/lib/Headers/altivec.h +++ b/lib/Headers/altivec.h @@ -12574,6 +12574,9 @@ static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) { #ifdef __POWER9_VECTOR__ +#define vec_insert4b __builtin_vsx_insertword +#define vec_extract4b __builtin_vsx_extractuword + /* vec_extract_exp */ static __inline__ vector unsigned int __ATTRS_o_ai diff --git a/lib/Headers/intrin.h b/lib/Headers/intrin.h index 7c91ebaee8cb..a35262af846a 100644 --- a/lib/Headers/intrin.h +++ b/lib/Headers/intrin.h @@ -65,7 +65,6 @@ static __inline__ void __cpuid(int[4], int); static __inline__ void __cpuidex(int[4], int, int); -void __debugbreak(void); static __inline__ __int64 __emul(int, int); static __inline__ @@ -109,10 +108,6 @@ void __outdword(unsigned short, unsigned long); void __outdwordstring(unsigned short, unsigned long *, unsigned long); void __outword(unsigned short, unsigned short); void __outwordstring(unsigned short, unsigned short *, unsigned long); -static __inline__ -unsigned int __popcnt(unsigned int); -static __inline__ -unsigned short __popcnt16(unsigned short); unsigned long __readcr0(void); unsigned long __readcr2(void); static __inline__ @@ -124,8 +119,6 @@ unsigned int __readdr(unsigned int); static __inline__ unsigned char __readfsbyte(unsigned long); static __inline__ -unsigned long __readfsdword(unsigned long); -static __inline__ unsigned __int64 __readfsqword(unsigned long); static __inline__ unsigned short __readfsword(unsigned long); @@ -179,108 +172,34 @@ static __inline__ unsigned char _bittestandreset(long *, long); static __inline__ unsigned char _bittestandset(long *, long); -unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64); -unsigned long __cdecl _byteswap_ulong(unsigned long); -unsigned short __cdecl _byteswap_ushort(unsigned short); void __cdecl _disable(void); void __cdecl _enable(void); long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value); -static __inline__ -long _InterlockedAnd(long volatile *_Value, long _Mask); -static __inline__ -short _InterlockedAnd16(short volatile *_Value, short _Mask); -static __inline__ -char _InterlockedAnd8(char volatile *_Value, char _Mask); unsigned char _interlockedbittestandreset(long volatile *, long); static __inline__ unsigned char _interlockedbittestandset(long volatile *, long); -static __inline__ -long __cdecl 
_InterlockedCompareExchange(long volatile *_Destination, - long _Exchange, long _Comparand); long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long); long _InterlockedCompareExchange_HLERelease(long volatile *, long, long); -static __inline__ -short _InterlockedCompareExchange16(short volatile *_Destination, - short _Exchange, short _Comparand); -static __inline__ -__int64 _InterlockedCompareExchange64(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand); __int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64, __int64); __int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64, __int64); -static __inline__ -char _InterlockedCompareExchange8(char volatile *_Destination, char _Exchange, - char _Comparand); void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *, void *); void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *, void *); -static __inline__ -long __cdecl _InterlockedDecrement(long volatile *_Addend); -static __inline__ -short _InterlockedDecrement16(short volatile *_Addend); -long _InterlockedExchange(long volatile *_Target, long _Value); -static __inline__ -short _InterlockedExchange16(short volatile *_Target, short _Value); -static __inline__ -char _InterlockedExchange8(char volatile *_Target, char _Value); -static __inline__ -long __cdecl _InterlockedExchangeAdd(long volatile *_Addend, long _Value); long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long); long _InterlockedExchangeAdd_HLERelease(long volatile *, long); -static __inline__ -short _InterlockedExchangeAdd16(short volatile *_Addend, short _Value); __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64); __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64); -static __inline__ -char _InterlockedExchangeAdd8(char volatile *_Addend, char _Value); -static __inline__ -long __cdecl _InterlockedIncrement(long volatile *_Addend); -static __inline__ -short _InterlockedIncrement16(short volatile *_Addend); -static __inline__ -long _InterlockedOr(long volatile *_Value, long _Mask); -static __inline__ -short _InterlockedOr16(short volatile *_Value, short _Mask); -static __inline__ -char _InterlockedOr8(char volatile *_Value, char _Mask); -static __inline__ -long _InterlockedXor(long volatile *_Value, long _Mask); -static __inline__ -short _InterlockedXor16(short volatile *_Value, short _Mask); -static __inline__ -char _InterlockedXor8(char volatile *_Value, char _Mask); void __cdecl _invpcid(unsigned int, void *); -static __inline__ -unsigned long __cdecl _lrotl(unsigned long, int); -static __inline__ -unsigned long __cdecl _lrotr(unsigned long, int); static __inline__ void __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void); static __inline__ void __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void); -static __inline__ -void *_ReturnAddress(void); unsigned int _rorx_u32(unsigned int, const unsigned int); -static __inline__ -unsigned int __cdecl _rotl(unsigned int _Value, int _Shift); -static __inline__ -unsigned short _rotl16(unsigned short _Value, unsigned char _Shift); -static __inline__ -unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift); -static __inline__ -unsigned char _rotl8(unsigned char _Value, unsigned char _Shift); -static __inline__ -unsigned int __cdecl _rotr(unsigned int _Value, int _Shift); -static __inline__ -unsigned short 
_rotr16(unsigned short _Value, unsigned char _Shift); -static __inline__ -unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift); -static __inline__ -unsigned char _rotr8(unsigned char _Value, unsigned char _Shift); int _sarx_i32(int, unsigned int); #if __STDC_HOSTED__ int __cdecl _setjmp(jmp_buf); @@ -318,8 +237,6 @@ unsigned __int64 __lzcnt64(unsigned __int64); static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); static __inline__ -unsigned __int64 __popcnt64(unsigned __int64); -static __inline__ unsigned char __readgsbyte(unsigned long); static __inline__ unsigned long __readgsdword(unsigned long); @@ -357,7 +274,6 @@ static __inline__ unsigned char _bittestandreset64(__int64 *, __int64); static __inline__ unsigned char _bittestandset64(__int64 *, __int64); -unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64); long _InterlockedAnd_np(long volatile *_Value, long _Mask); short _InterlockedAnd16_np(short volatile *_Value, short _Mask); __int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask); @@ -383,11 +299,8 @@ __int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64, __int64); __int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination, __int64 _Exchange, __int64 _Comparand); -void *_InterlockedCompareExchangePointer(void *volatile *_Destination, - void *_Exchange, void *_Comparand); void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination, void *_Exchange, void *_Comparand); -void *_InterlockedExchangePointer(void *volatile *_Target, void *_Value); long _InterlockedOr_np(long volatile *_Value, long _Mask); short _InterlockedOr16_np(short volatile *_Value, short _Mask); __int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask); @@ -398,9 +311,6 @@ __int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask); char _InterlockedXor8_np(char volatile *_Value, char _Mask); unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int); __int64 _sarx_i64(__int64, unsigned int); -#if __STDC_HOSTED__ -int __cdecl _setjmpex(jmp_buf); -#endif unsigned __int64 _shlx_u64(unsigned __int64, unsigned int); unsigned __int64 _shrx_u64(unsigned __int64, unsigned int); static __inline__ diff --git a/lib/Lex/HeaderSearch.cpp b/lib/Lex/HeaderSearch.cpp index b5228fc6c8cb..fa2a76ef47ca 100644 --- a/lib/Lex/HeaderSearch.cpp +++ b/lib/Lex/HeaderSearch.cpp @@ -54,7 +54,7 @@ HeaderFileInfo::getControllingMacro(ExternalPreprocessorSource *External) { ExternalHeaderFileInfoSource::~ExternalHeaderFileInfoSource() {} -HeaderSearch::HeaderSearch(IntrusiveRefCntPtr HSOpts, +HeaderSearch::HeaderSearch(std::shared_ptr HSOpts, SourceManager &SourceMgr, DiagnosticsEngine &Diags, const LangOptions &LangOpts, const TargetInfo *Target) diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp index 0f7473b8c1ff..91319bedd6f0 100644 --- a/lib/Lex/Preprocessor.cpp +++ b/lib/Lex/Preprocessor.cpp @@ -68,7 +68,7 @@ LLVM_INSTANTIATE_REGISTRY(PragmaHandlerRegistry) //===----------------------------------------------------------------------===// ExternalPreprocessorSource::~ExternalPreprocessorSource() { } -Preprocessor::Preprocessor(IntrusiveRefCntPtr PPOpts, +Preprocessor::Preprocessor(std::shared_ptr PPOpts, DiagnosticsEngine &diags, LangOptions &opts, SourceManager &SM, HeaderSearch &Headers, ModuleLoader &TheModuleLoader, diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp index ad4005747310..ba24adefe6b0 100644 --- a/lib/Parse/ParseDecl.cpp +++ 
b/lib/Parse/ParseDecl.cpp @@ -177,8 +177,12 @@ void Parser::ParseGNUAttributes(ParsedAttributes &attrs, if (!ClassStack.empty() && !LateAttrs->parseSoon()) getCurrentClass().LateParsedDeclarations.push_back(LA); - // consume everything up to and including the matching right parens - ConsumeAndStoreUntil(tok::r_paren, LA->Toks, true, false); + // Be sure ConsumeAndStoreUntil doesn't see the start l_paren, since it + // recursively consumes balanced parens. + LA->Toks.push_back(Tok); + ConsumeParen(); + // Consume everything up to and including the matching right parens. + ConsumeAndStoreUntil(tok::r_paren, LA->Toks, /*StopAtSemi=*/true); Token Eof; Eof.startToken(); diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index caf2320f8fc1..55b5ff498574 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -2751,6 +2751,7 @@ void Parser::ParseBlockId(SourceLocation CaretLoc) { // Parse the block-declarator. Declarator DeclaratorInfo(DS, Declarator::BlockLiteralContext); + DeclaratorInfo.setFunctionDefinitionKind(FDK_Definition); ParseDeclarator(DeclaratorInfo); MaybeParseGNUAttributes(DeclaratorInfo); @@ -2789,6 +2790,7 @@ ExprResult Parser::ParseBlockLiteralExpression() { // Parse the return type if present. DeclSpec DS(AttrFactory); Declarator ParamInfo(DS, Declarator::BlockLiteralContext); + ParamInfo.setFunctionDefinitionKind(FDK_Definition); // FIXME: Since the return type isn't actually parsed, it can't be used to // fill ParamInfo with an initial valid range, so do it manually. ParamInfo.SetSourceRange(SourceRange(Tok.getLocation(), Tok.getLocation())); diff --git a/lib/Parse/ParsePragma.cpp b/lib/Parse/ParsePragma.cpp index 2dc6a0739bc8..89733237c153 100644 --- a/lib/Parse/ParsePragma.cpp +++ b/lib/Parse/ParsePragma.cpp @@ -506,10 +506,12 @@ void Parser::HandlePragmaOpenCLExtension() { // overriding all previously issued extension directives, but only if the // behavior is set to disable." if (Name == "all") { - if (State == Disable) + if (State == Disable) { Opt.disableAll(); - else + Opt.enableSupportedCore(getLangOpts().OpenCLVersion); + } else { PP.Diag(NameLoc, diag::warn_pragma_expected_predicate) << 1; + } } else if (State == Begin) { if (!Opt.isKnown(Name) || !Opt.isSupported(Name, getLangOpts().OpenCLVersion)) { diff --git a/lib/Sema/SemaCodeComplete.cpp b/lib/Sema/SemaCodeComplete.cpp index 3eef366b75b3..94cfc4baca51 100644 --- a/lib/Sema/SemaCodeComplete.cpp +++ b/lib/Sema/SemaCodeComplete.cpp @@ -3720,9 +3720,17 @@ static void AddObjCProperties( Builder.AddPlaceholderChunk( Builder.getAllocator().CopyString(PlaceholderStr)); + // When completing blocks properties that return void the default + // property completion result should show up before the setter, + // otherwise the setter completion should show up before the default + // property completion, as we normally want to use the result of the + // call. Results.MaybeAddResult( Result(Builder.TakeString(), P, - Results.getBasePriority(P) + CCD_BlockPropertySetter), + Results.getBasePriority(P) + + (BlockLoc.getTypePtr()->getReturnType()->isVoidType() + ? 
CCD_BlockPropertySetter + : -CCD_BlockPropertySetter)), CurContext); } }; diff --git a/lib/Sema/SemaDeclCXX.cpp b/lib/Sema/SemaDeclCXX.cpp index 084bd4c45eda..a650621b573a 100644 --- a/lib/Sema/SemaDeclCXX.cpp +++ b/lib/Sema/SemaDeclCXX.cpp @@ -5395,6 +5395,26 @@ static void ReferenceDllExportedMethods(Sema &S, CXXRecordDecl *Class) { } } +static void checkForMultipleExportedDefaultConstructors(Sema &S, CXXRecordDecl *Class) { + CXXConstructorDecl *LastExportedDefaultCtor = nullptr; + for (Decl *Member : Class->decls()) { + // Look for exported default constructors. + auto *CD = dyn_cast(Member); + if (!CD || !CD->isDefaultConstructor() || !CD->hasAttr()) + continue; + + if (LastExportedDefaultCtor) { + S.Diag(LastExportedDefaultCtor->getLocation(), + diag::err_attribute_dll_ambiguous_default_ctor) + << Class; + S.Diag(CD->getLocation(), diag::note_entity_declared_at) + << CD->getDeclName(); + return; + } + LastExportedDefaultCtor = CD; + } +} + /// \brief Check class-level dllimport/dllexport attribute. void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) { Attr *ClassAttr = getDLLAttr(Class); @@ -10362,64 +10382,11 @@ void Sema::ActOnFinishCXXMemberDecls() { DelayedExceptionSpecChecks.clear(); return; } - } -} - -static void checkDefaultArgExprsForConstructors(Sema &S, CXXRecordDecl *Class) { - // Don't do anything for template patterns. - if (Class->getDescribedClassTemplate()) - return; - - CallingConv ExpectedCallingConv = S.Context.getDefaultCallingConvention( - /*IsVariadic=*/false, /*IsCXXMethod=*/true); - - CXXConstructorDecl *LastExportedDefaultCtor = nullptr; - for (Decl *Member : Class->decls()) { - auto *CD = dyn_cast(Member); - if (!CD) { - // Recurse on nested classes. - if (auto *NestedRD = dyn_cast(Member)) - checkDefaultArgExprsForConstructors(S, NestedRD); - continue; - } else if (!CD->isDefaultConstructor() || !CD->hasAttr()) { - continue; - } - - CallingConv ActualCallingConv = - CD->getType()->getAs()->getCallConv(); - - // Skip default constructors with typical calling conventions and no default - // arguments. - unsigned NumParams = CD->getNumParams(); - if (ExpectedCallingConv == ActualCallingConv && NumParams == 0) - continue; - - if (LastExportedDefaultCtor) { - S.Diag(LastExportedDefaultCtor->getLocation(), - diag::err_attribute_dll_ambiguous_default_ctor) << Class; - S.Diag(CD->getLocation(), diag::note_entity_declared_at) - << CD->getDeclName(); - return; - } - LastExportedDefaultCtor = CD; - - for (unsigned I = 0; I != NumParams; ++I) { - (void)S.CheckCXXDefaultArgExpr(Class->getLocation(), CD, - CD->getParamDecl(I)); - S.DiscardCleanupsInEvaluationContext(); - } + checkForMultipleExportedDefaultConstructors(*this, Record); } } void Sema::ActOnFinishCXXNonNestedClass(Decl *D) { - auto *RD = dyn_cast(D); - - // Default constructors that are annotated with __declspec(dllexport) which - // have default arguments or don't use the standard calling convention are - // wrapped with a thunk called the default constructor closure. - if (RD && Context.getTargetInfo().getCXXABI().isMicrosoft()) - checkDefaultArgExprsForConstructors(*this, RD); - referenceDLLExportedClassMethods(); } diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp index 3c554c9a5244..1509b22a9e5a 100644 --- a/lib/Sema/SemaExpr.cpp +++ b/lib/Sema/SemaExpr.cpp @@ -2777,6 +2777,9 @@ bool Sema::UseArgumentDependentLookup(const CXXScopeSpec &SS, /// were not overloaded, and it doesn't promise that the declaration /// will in fact be used. 
static bool CheckDeclInExpr(Sema &S, SourceLocation Loc, NamedDecl *D) { + if (D->isInvalidDecl()) + return true; + if (isa(D)) { S.Diag(Loc, diag::err_unexpected_typedef) << D->getDeclName(); return true; diff --git a/lib/Sema/SemaExprCXX.cpp b/lib/Sema/SemaExprCXX.cpp index 5f769cc40ded..1379440e8a03 100644 --- a/lib/Sema/SemaExprCXX.cpp +++ b/lib/Sema/SemaExprCXX.cpp @@ -7262,6 +7262,8 @@ public: while (TypoCorrection TC = State.Consumer->getNextCorrection()) { if (InitDecl && TC.getFoundDecl() == InitDecl) continue; + // FIXME: If we would typo-correct to an invalid declaration, it's + // probably best to just suppress all errors from this typo correction. ExprResult NE = State.RecoveryHandler ? State.RecoveryHandler(SemaRef, E, TC) : attemptRecovery(SemaRef, *State.Consumer, TC); diff --git a/lib/Sema/SemaOverload.cpp b/lib/Sema/SemaOverload.cpp index 1c026d7adb36..33574b9aec35 100644 --- a/lib/Sema/SemaOverload.cpp +++ b/lib/Sema/SemaOverload.cpp @@ -604,7 +604,8 @@ clang::MakeDeductionFailureInfo(ASTContext &Context, Result.Data = Info.Param.getOpaqueValue(); break; - case Sema::TDK_DeducedMismatch: { + case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: { // FIXME: Should allocate from normal heap so that we can free this later. auto *Saved = new (Context) DFIDeducedMismatchArgs; Saved->FirstArg = Info.FirstArg; @@ -664,6 +665,7 @@ void DeductionFailureInfo::Destroy() { case Sema::TDK_Inconsistent: case Sema::TDK_Underqualified: case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: case Sema::TDK_NonDeducedMismatch: // FIXME: Destroy the data? Data = nullptr; @@ -699,6 +701,7 @@ TemplateParameter DeductionFailureInfo::getTemplateParameter() { case Sema::TDK_TooFewArguments: case Sema::TDK_SubstitutionFailure: case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: case Sema::TDK_NonDeducedMismatch: case Sema::TDK_CUDATargetMismatch: return TemplateParameter(); @@ -735,6 +738,7 @@ TemplateArgumentList *DeductionFailureInfo::getTemplateArgumentList() { return nullptr; case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: return static_cast(Data)->TemplateArgs; case Sema::TDK_SubstitutionFailure: @@ -764,6 +768,7 @@ const TemplateArgument *DeductionFailureInfo::getFirstArg() { case Sema::TDK_Inconsistent: case Sema::TDK_Underqualified: case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: case Sema::TDK_NonDeducedMismatch: return &static_cast(Data)->FirstArg; @@ -791,6 +796,7 @@ const TemplateArgument *DeductionFailureInfo::getSecondArg() { case Sema::TDK_Inconsistent: case Sema::TDK_Underqualified: case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: case Sema::TDK_NonDeducedMismatch: return &static_cast(Data)->SecondArg; @@ -803,11 +809,14 @@ const TemplateArgument *DeductionFailureInfo::getSecondArg() { } llvm::Optional DeductionFailureInfo::getCallArgIndex() { - if (static_cast(Result) == - Sema::TDK_DeducedMismatch) + switch (static_cast(Result)) { + case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: return static_cast(Data)->CallArgIndex; - return llvm::None; + default: + return llvm::None; + } } void OverloadCandidateSet::destroyCandidates() { @@ -9682,7 +9691,8 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated, return; } - case Sema::TDK_DeducedMismatch: { + case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: { // Format the template argument list into the argument string. 
SmallString<128> TemplateArgString; if (TemplateArgumentList *Args = @@ -9695,7 +9705,8 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated, S.Diag(Templated->getLocation(), diag::note_ovl_candidate_deduced_mismatch) << (*DeductionFailure.getCallArgIndex() + 1) << *DeductionFailure.getFirstArg() << *DeductionFailure.getSecondArg() - << TemplateArgString; + << TemplateArgString + << (DeductionFailure.Result == Sema::TDK_DeducedMismatchNested); break; } @@ -10012,6 +10023,7 @@ static unsigned RankDeductionFailure(const DeductionFailureInfo &DFI) { case Sema::TDK_SubstitutionFailure: case Sema::TDK_DeducedMismatch: + case Sema::TDK_DeducedMismatchNested: case Sema::TDK_NonDeducedMismatch: case Sema::TDK_MiscellaneousDeductionFailure: case Sema::TDK_CUDATargetMismatch: diff --git a/lib/Sema/SemaTemplateDeduction.cpp b/lib/Sema/SemaTemplateDeduction.cpp index c16b28bcf139..b79904c0a703 100644 --- a/lib/Sema/SemaTemplateDeduction.cpp +++ b/lib/Sema/SemaTemplateDeduction.cpp @@ -19,6 +19,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/StmtVisitor.h" +#include "clang/AST/TypeOrdering.h" #include "clang/Sema/DeclSpec.h" #include "clang/Sema/Sema.h" #include "clang/Sema/Template.h" @@ -1899,8 +1900,9 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams, // Check whether we have enough arguments. if (!hasTemplateArgumentForDeduction(Args, ArgIdx)) - return NumberOfArgumentsMustMatch ? Sema::TDK_TooFewArguments - : Sema::TDK_Success; + return NumberOfArgumentsMustMatch + ? Sema::TDK_MiscellaneousDeductionFailure + : Sema::TDK_Success; // C++1z [temp.deduct.type]p9: // During partial ordering, if Ai was originally a pack expansion [and] @@ -2214,25 +2216,26 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments( if (!Deduced[I].isNull()) { if (I < NumAlreadyConverted) { - // We have already fully type-checked and converted this - // argument, because it was explicitly-specified. Just record the - // presence of this argument. - Builder.push_back(Deduced[I]); // We may have had explicitly-specified template arguments for a // template parameter pack (that may or may not have been extended // via additional deduced arguments). - if (Param->isParameterPack() && CurrentInstantiationScope) { - if (CurrentInstantiationScope->getPartiallySubstitutedPack() == - Param) { - // Forget the partially-substituted pack; its substitution is now - // complete. - CurrentInstantiationScope->ResetPartiallySubstitutedPack(); - } + if (Param->isParameterPack() && CurrentInstantiationScope && + CurrentInstantiationScope->getPartiallySubstitutedPack() == Param) { + // Forget the partially-substituted pack; its substitution is now + // complete. + CurrentInstantiationScope->ResetPartiallySubstitutedPack(); + // We still need to check the argument in case it was extended by + // deduction. + } else { + // We have already fully type-checked and converted this + // argument, because it was explicitly-specified. Just record the + // presence of this argument. + Builder.push_back(Deduced[I]); + continue; } - continue; } - // We have deduced this argument, so it still needs to be + // We may have deduced this argument, so it still needs to be // checked and converted. 
diff --git a/lib/Sema/SemaTemplateDeduction.cpp b/lib/Sema/SemaTemplateDeduction.cpp
index c16b28bcf139..b79904c0a703 100644
--- a/lib/Sema/SemaTemplateDeduction.cpp
+++ b/lib/Sema/SemaTemplateDeduction.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/AST/TypeOrdering.h"
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/Template.h"
@@ -1899,8 +1900,9 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams,

     // Check whether we have enough arguments.
     if (!hasTemplateArgumentForDeduction(Args, ArgIdx))
-      return NumberOfArgumentsMustMatch ? Sema::TDK_TooFewArguments
-                                        : Sema::TDK_Success;
+      return NumberOfArgumentsMustMatch
+                 ? Sema::TDK_MiscellaneousDeductionFailure
+                 : Sema::TDK_Success;

     // C++1z [temp.deduct.type]p9:
     //   During partial ordering, if Ai was originally a pack expansion [and]
@@ -2214,25 +2216,26 @@ static Sema::TemplateDeductionResult ConvertDeducedTemplateArguments(
     if (!Deduced[I].isNull()) {
       if (I < NumAlreadyConverted) {
-        // We have already fully type-checked and converted this
-        // argument, because it was explicitly-specified. Just record the
-        // presence of this argument.
-        Builder.push_back(Deduced[I]);

         // We may have had explicitly-specified template arguments for a
         // template parameter pack (that may or may not have been extended
         // via additional deduced arguments).
-        if (Param->isParameterPack() && CurrentInstantiationScope) {
-          if (CurrentInstantiationScope->getPartiallySubstitutedPack() ==
-              Param) {
-            // Forget the partially-substituted pack; its substitution is now
-            // complete.
-            CurrentInstantiationScope->ResetPartiallySubstitutedPack();
-          }
+        if (Param->isParameterPack() && CurrentInstantiationScope &&
+            CurrentInstantiationScope->getPartiallySubstitutedPack() == Param) {
+          // Forget the partially-substituted pack; its substitution is now
+          // complete.
+          CurrentInstantiationScope->ResetPartiallySubstitutedPack();
+          // We still need to check the argument in case it was extended by
+          // deduction.
+        } else {
+          // We have already fully type-checked and converted this
+          // argument, because it was explicitly-specified. Just record the
+          // presence of this argument.
+          Builder.push_back(Deduced[I]);
+          continue;
         }
-        continue;
       }

-      // We have deduced this argument, so it still needs to be
+      // We may have deduced this argument, so it still needs to be
       // checked and converted.
       if (ConvertDeducedTemplateArgument(S, Param, Deduced[I],
                                          Template, Info, IsDeduced, Builder)) {
@@ -2854,6 +2857,36 @@ CheckOriginalCallArgDeduction(Sema &S, Sema::OriginalCallArg OriginalArg,
   return true;
 }

+/// Find the pack index for a particular parameter index in an instantiation of
+/// a function template with specific arguments.
+///
+/// \return The pack index for whichever pack produced this parameter, or -1
+///         if this was not produced by a parameter. Intended to be used as the
+///         ArgumentPackSubstitutionIndex for further substitutions.
+// FIXME: We should track this in OriginalCallArgs so we don't need to
+// reconstruct it here.
+static unsigned getPackIndexForParam(Sema &S,
+                                     FunctionTemplateDecl *FunctionTemplate,
+                                     const MultiLevelTemplateArgumentList &Args,
+                                     unsigned ParamIdx) {
+  unsigned Idx = 0;
+  for (auto *PD : FunctionTemplate->getTemplatedDecl()->parameters()) {
+    if (PD->isParameterPack()) {
+      unsigned NumExpansions =
+          S.getNumArgumentsInExpansion(PD->getType(), Args).getValueOr(1);
+      if (Idx + NumExpansions > ParamIdx)
+        return ParamIdx - Idx;
+      Idx += NumExpansions;
+    } else {
+      if (Idx == ParamIdx)
+        return -1; // Not a pack expansion
+      ++Idx;
+    }
+  }
+
+  llvm_unreachable("parameter index would not be produced from template");
+}
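The helper above walks the declared parameters and maps a flattened parameter index in the instantiation back to an index within the pack that produced it. A standalone sketch of the same arithmetic, detached from Sema and with hypothetical names:

#include <vector>

// One entry per declared parameter: a fixed parameter occupies one slot; a
// pack expansion occupies 'Expansions' slots in the instantiated signature.
struct ParamShape { bool IsPack; unsigned Expansions; };

// Returns the index into the originating pack, or -1 for a non-pack parameter.
int packIndexForParam(const std::vector<ParamShape> &Params, unsigned ParamIdx) {
  unsigned Idx = 0;
  for (const ParamShape &P : Params) {
    if (P.IsPack) {
      if (Idx + P.Expansions > ParamIdx)
        return int(ParamIdx - Idx); // produced by this pack
      Idx += P.Expansions;
    } else {
      if (Idx == ParamIdx)
        return -1; // not a pack expansion
      ++Idx;
    }
  }
  return -1; // out of range; the real helper asserts instead
}

Note that the real function is declared as returning unsigned yet yields -1 for the non-pack case; the value is consumed as an int ArgumentPackSubstitutionIndex, where -1 conventionally means "no pack substitution".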
 /// \brief Finish template argument deduction for a function template,
 /// checking the deduced template arguments for completeness and forming
 /// the function template specialization.
@@ -2904,9 +2937,9 @@ Sema::FinishTemplateArgumentDeduction(FunctionTemplateDecl *FunctionTemplate,
   DeclContext *Owner = FunctionTemplate->getDeclContext();
   if (FunctionTemplate->getFriendObjectKind())
     Owner = FunctionTemplate->getLexicalDeclContext();
+  MultiLevelTemplateArgumentList SubstArgs(*DeducedArgumentList);
   Specialization = cast_or_null<FunctionDecl>(
-      SubstDecl(FunctionTemplate->getTemplatedDecl(), Owner,
-                MultiLevelTemplateArgumentList(*DeducedArgumentList)));
+      SubstDecl(FunctionTemplate->getTemplatedDecl(), Owner, SubstArgs));
   if (!Specialization || Specialization->isInvalidDecl())
     return TDK_SubstitutionFailure;
@@ -2932,19 +2965,46 @@ Sema::FinishTemplateArgumentDeduction(FunctionTemplateDecl *FunctionTemplate,
     //   In general, the deduction process attempts to find template argument
     //   values that will make the deduced A identical to A (after the type A
     //   is transformed as described above). [...]
+    llvm::SmallDenseMap<std::pair<unsigned, QualType>, QualType> DeducedATypes;
    for (unsigned I = 0, N = OriginalCallArgs->size(); I != N; ++I) {
      OriginalCallArg OriginalArg = (*OriginalCallArgs)[I];

-      unsigned ParamIdx = OriginalArg.ArgIdx;
+      auto ParamIdx = OriginalArg.ArgIdx;
       if (ParamIdx >= Specialization->getNumParams())
+        // FIXME: This presumably means a pack ended up smaller than we
+        // expected while deducing. Should this not result in deduction
+        // failure? Can it even happen?
         continue;

-      QualType DeducedA = Specialization->getParamDecl(ParamIdx)->getType();
+      QualType DeducedA;
+      if (!OriginalArg.DecomposedParam) {
+        // P is one of the function parameters, just look up its substituted
+        // type.
+        DeducedA = Specialization->getParamDecl(ParamIdx)->getType();
+      } else {
+        // P is a decomposed element of a parameter corresponding to a
+        // braced-init-list argument. Substitute back into P to find the
+        // deduced A.
+        QualType &CacheEntry =
+            DeducedATypes[{ParamIdx, OriginalArg.OriginalParamType}];
+        if (CacheEntry.isNull()) {
+          ArgumentPackSubstitutionIndexRAII PackIndex(
+              *this, getPackIndexForParam(*this, FunctionTemplate, SubstArgs,
+                                          ParamIdx));
+          CacheEntry =
+              SubstType(OriginalArg.OriginalParamType, SubstArgs,
+                        Specialization->getTypeSpecStartLoc(),
+                        Specialization->getDeclName());
+        }
+        DeducedA = CacheEntry;
+      }
+
       if (CheckOriginalCallArgDeduction(*this, OriginalArg, DeducedA)) {
         Info.FirstArg = TemplateArgument(DeducedA);
         Info.SecondArg = TemplateArgument(OriginalArg.OriginalArgType);
         Info.CallArgIndex = OriginalArg.ArgIdx;
-        return TDK_DeducedMismatch;
+        return OriginalArg.DecomposedParam ? TDK_DeducedMismatchNested
+                                           : TDK_DeducedMismatch;
       }
     }
   }
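One plausible call shape that exercises the decomposed path above — each braced list is matched elementwise, and re-substituting the element type P0 for a later list needs the pack index recovered by getPackIndexForParam (a sketch, not from the patch; the index values are an assumption about how the pack is expanded):

#include <initializer_list>

template <typename... T> void h(std::initializer_list<T>... Lists);

void use() {
  // Two decomposed arguments; checking the deduced A for the second list's
  // elements re-substitutes P0 with the pack index of that expansion (here 1).
  h({1, 2}, {3.0, 4.0}); // deduces T = <int, double>
}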
@@ -3196,19 +3256,21 @@ static bool hasDeducibleTemplateParameters(Sema &S,
                                            FunctionTemplateDecl *FunctionTemplate,
                                            QualType T);

-static Sema::TemplateDeductionResult DeduceTemplateArgumentByListElement(
+static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
     Sema &S, TemplateParameterList *TemplateParams, QualType ParamType,
     Expr *Arg, TemplateDeductionInfo &Info,
-    SmallVectorImpl<DeducedTemplateArgument> &Deduced, unsigned TDF);
+    SmallVectorImpl<DeducedTemplateArgument> &Deduced,
+    SmallVectorImpl<Sema::OriginalCallArg> &OriginalCallArgs,
+    bool DecomposedParam, unsigned ArgIdx, unsigned TDF);

 /// \brief Attempt template argument deduction from an initializer list
 ///        deemed to be an argument in a function call.
-static Sema::TemplateDeductionResult
-DeduceFromInitializerList(Sema &S, TemplateParameterList *TemplateParams,
-                          QualType AdjustedParamType, InitListExpr *ILE,
-                          TemplateDeductionInfo &Info,
-                          SmallVectorImpl<DeducedTemplateArgument> &Deduced,
-                          unsigned TDF) {
+static Sema::TemplateDeductionResult DeduceFromInitializerList(
+    Sema &S, TemplateParameterList *TemplateParams, QualType AdjustedParamType,
+    InitListExpr *ILE, TemplateDeductionInfo &Info,
+    SmallVectorImpl<DeducedTemplateArgument> &Deduced,
+    SmallVectorImpl<Sema::OriginalCallArg> &OriginalCallArgs, unsigned ArgIdx,
+    unsigned TDF) {
   // C++ [temp.deduct.call]p1: (CWG 1591)
   //   If removing references and cv-qualifiers from P gives
   //   std::initializer_list<P0> or P0[N] for some P0 and N and the argument is
   //   a non-empty initializer list, then deduction is performed instead for
   //   each element of the initializer list, taking P0 as a function template
   //   parameter type and the initializer element as its argument
   //
-  // FIXME: Remove references and cv-qualifiers here? Consider
-  //        std::initializer_list<std::initializer_list<int>&&>
+  // We've already removed references and cv-qualifiers here.
+  if (!ILE->getNumInits())
+    return Sema::TDK_Success;
+
   QualType ElTy;
   auto *ArrTy = S.Context.getAsArrayType(AdjustedParamType);
   if (ArrTy)
@@ -3231,15 +3295,15 @@ DeduceFromInitializerList(Sema &S, TemplateParameterList *TemplateParams,
   // Deduction only needs to be done for dependent types.
   if (ElTy->isDependentType()) {
     for (Expr *E : ILE->inits()) {
-      if (auto Result = DeduceTemplateArgumentByListElement(
-              S, TemplateParams, ElTy, E, Info, Deduced, TDF))
+      if (auto Result = DeduceTemplateArgumentsFromCallArgument(
+              S, TemplateParams, ElTy, E, Info, Deduced, OriginalCallArgs, true,
+              ArgIdx, TDF))
         return Result;
     }
   }

   //   in the P0[N] case, if N is a non-type template parameter, N is deduced
   //   from the length of the initializer list.
-  // FIXME: We're not supposed to get here if N would be deduced as 0.
   if (auto *DependentArrTy = dyn_cast_or_null<DependentSizedArrayType>(ArrTy)) {
     // Determine whether the array bound is something we can deduce.
     if (NonTypeTemplateParmDecl *NTTP =
@@ -3258,30 +3322,35 @@ DeduceFromInitializerList(Sema &S, TemplateParameterList *TemplateParams,
   return Sema::TDK_Success;
 }

-/// \brief Perform template argument deduction by matching a parameter type
-///        against a single expression, where the expression is an element of
-///        an initializer list that was originally matched against a parameter
-///        of type \c initializer_list\<T\>.
-static Sema::TemplateDeductionResult
-DeduceTemplateArgumentByListElement(Sema &S,
-                                    TemplateParameterList *TemplateParams,
-                                    QualType ParamType, Expr *Arg,
-                                    TemplateDeductionInfo &Info,
-                                    SmallVectorImpl<DeducedTemplateArgument> &Deduced,
-                                    unsigned TDF) {
-  // Handle the case where an init list contains another init list as the
-  // element.
-  if (InitListExpr *ILE = dyn_cast<InitListExpr>(Arg))
-    return DeduceFromInitializerList(S, TemplateParams,
-                                     ParamType.getNonReferenceType(), ILE, Info,
-                                     Deduced, TDF);
-
-  // For all other cases, just match by type.
+/// \brief Perform template argument deduction per [temp.deduct.call] for a
+///        single parameter / argument pair.
+static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument(
+    Sema &S, TemplateParameterList *TemplateParams, QualType ParamType,
+    Expr *Arg, TemplateDeductionInfo &Info,
+    SmallVectorImpl<DeducedTemplateArgument> &Deduced,
+    SmallVectorImpl<Sema::OriginalCallArg> &OriginalCallArgs,
+    bool DecomposedParam, unsigned ArgIdx, unsigned TDF) {
   QualType ArgType = Arg->getType();
+  QualType OrigParamType = ParamType;
+
+  //   If P is a reference type [...]
+  //   If P is a cv-qualified type [...]
   if (AdjustFunctionParmAndArgTypesForDeduction(S, TemplateParams, ParamType,
                                                 ArgType, Arg, TDF))
     return Sema::TDK_Success;

+  //   If [...] the argument is a non-empty initializer list [...]
+  if (InitListExpr *ILE = dyn_cast<InitListExpr>(Arg))
+    return DeduceFromInitializerList(S, TemplateParams, ParamType, ILE, Info,
+                                     Deduced, OriginalCallArgs, ArgIdx, TDF);
+
+  //   [...] the deduction process attempts to find template argument values
+  //   that will make the deduced A identical to A
+  //
+  // Keep track of the argument type and corresponding parameter index,
+  // so we can check for compatibility between the deduced A and A.
+  OriginalCallArgs.push_back(
+      Sema::OriginalCallArg(OrigParamType, DecomposedParam, ArgIdx, ArgType));
   return DeduceTemplateArgumentsByTypeMatch(S, TemplateParams, ParamType,
                                             ArgType, Info, Deduced, TDF);
 }
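Under CWG 1591, both forms handled by the rewritten function deduce from a braced argument. A sketch of calls that should now be accepted (assuming a conforming std::initializer_list):

#include <initializer_list>

template <typename T, unsigned N> void arr(const T (&)[N]); // the P0[N] case
template <typename T> void il(std::initializer_list<T>);    // the list case

void use() {
  arr({1, 2, 3}); // T = int, N = 3: the bound is deduced from the list length
  il({4, 5, 6});  // T = int: deduced from each element in turn
}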
@@ -3364,31 +3433,17 @@ Sema::TemplateDeductionResult Sema::DeduceTemplateArguments(

   // Deduce an argument of type ParamType from an expression with index ArgIdx.
   auto DeduceCallArgument = [&](QualType ParamType, unsigned ArgIdx) {
-    Expr *Arg = Args[ArgIdx];
-    QualType ArgType = Arg->getType();
-    QualType OrigParamType = ParamType;
-
-    unsigned TDF = 0;
-    if (AdjustFunctionParmAndArgTypesForDeduction(*this, TemplateParams,
-                                                  ParamType, ArgType, Arg,
-                                                  TDF))
-      return Sema::TDK_Success;
-
-    // If we have nothing to deduce, we're done.
+    // C++ [temp.deduct.call]p1: (DR1391)
+    //   Template argument deduction is done by comparing each function template
+    //   parameter that contains template-parameters that participate in
+    //   template argument deduction ...
     if (!hasDeducibleTemplateParameters(*this, FunctionTemplate, ParamType))
       return Sema::TDK_Success;

-    // If the argument is an initializer list ...
-    if (InitListExpr *ILE = dyn_cast<InitListExpr>(Arg))
-      return DeduceFromInitializerList(*this, TemplateParams, ParamType, ILE,
-                                       Info, Deduced, TDF);
-
-    // Keep track of the argument type and corresponding parameter index,
-    // so we can check for compatibility between the deduced A and A.
-    OriginalCallArgs.push_back(OriginalCallArg(OrigParamType, ArgIdx, ArgType));
-
-    return DeduceTemplateArgumentsByTypeMatch(*this, TemplateParams, ParamType,
-                                              ArgType, Info, Deduced, TDF);
+    //   ... with the type of the corresponding argument
+    return DeduceTemplateArgumentsFromCallArgument(
+        *this, TemplateParams, ParamType, Args[ArgIdx], Info, Deduced,
+        OriginalCallArgs, /*Decomposed*/false, ArgIdx, /*TDF*/ 0);
   };

   // Deduce template arguments from the function parameters.
@@ -4054,8 +4109,6 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *&Init, QualType &Result,
   // Deduce type of TemplParam in Func(Init)
   SmallVector<DeducedTemplateArgument, 1> Deduced;
   Deduced.resize(1);
-  QualType InitType = Init->getType();
-  unsigned TDF = 0;

   TemplateDeductionInfo Info(Loc, Depth);
@@ -4070,12 +4123,21 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *&Init, QualType &Result,
     return DAR_Failed;
   };

+  SmallVector<OriginalCallArg, 4> OriginalCallArgs;
+
   InitListExpr *InitList = dyn_cast<InitListExpr>(Init);
   if (InitList) {
+    // Notionally, we substitute std::initializer_list<T> for 'auto' and deduce
+    // against that. Such deduction only succeeds if removing cv-qualifiers and
+    // references results in std::initializer_list<T>.
+    if (!Type.getType().getNonReferenceType()->getAs<AutoType>())
+      return DAR_Failed;
+
     for (unsigned i = 0, e = InitList->getNumInits(); i < e; ++i) {
-      if (DeduceTemplateArgumentByListElement(*this, TemplateParamsSt.get(),
-                                              TemplArg, InitList->getInit(i),
-                                              Info, Deduced, TDF))
+      if (DeduceTemplateArgumentsFromCallArgument(
+              *this, TemplateParamsSt.get(), TemplArg, InitList->getInit(i),
+              Info, Deduced, OriginalCallArgs, /*Decomposed*/ true,
+              /*ArgIdx*/ 0, /*TDF*/ 0))
         return DeductionFailed();
     }
   } else {
@@ -4084,13 +4146,9 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *&Init, QualType &Result,
       return DAR_FailedAlreadyDiagnosed;
     }

-    if (AdjustFunctionParmAndArgTypesForDeduction(
-            *this, TemplateParamsSt.get(), FuncParam, InitType, Init, TDF))
-      return DAR_Failed;
-
-    if (DeduceTemplateArgumentsByTypeMatch(*this, TemplateParamsSt.get(),
-                                           FuncParam, InitType, Info, Deduced,
-                                           TDF))
+    if (DeduceTemplateArgumentsFromCallArgument(
+            *this, TemplateParamsSt.get(), FuncParam, Init, Info, Deduced,
+            OriginalCallArgs, /*Decomposed*/ false, /*ArgIdx*/ 0, /*TDF*/ 0))
      return DeductionFailed();
   }
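With auto deduction funnelled through the same helper, the observable behaviour at the source level looks like this (a sketch of the accepted and rejected forms):

#include <initializer_list>

void examples() {
  auto a = {1, 2, 3};   // std::initializer_list<int>
  auto &&r = {4, 5};    // OK: references/cv are stripped before the check
  // auto b = {1, 2.0}; // error: conflicting deductions for the element type
}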
@@ -4112,12 +4170,14 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *&Init, QualType &Result,

   // Check that the deduced argument type is compatible with the original
   // argument type per C++ [temp.deduct.call]p4.
-  if (!InitList && !Result.isNull() &&
-      CheckOriginalCallArgDeduction(*this,
-                                    Sema::OriginalCallArg(FuncParam,0,InitType),
-                                    Result)) {
-    Result = QualType();
-    return DeductionFailed();
+  QualType DeducedA = InitList ? Deduced[0].getAsType() : Result;
+  for (const OriginalCallArg &OriginalArg : OriginalCallArgs) {
+    assert((bool)InitList == OriginalArg.DecomposedParam &&
+           "decomposed non-init-list in auto deduction?");
+    if (CheckOriginalCallArgDeduction(*this, OriginalArg, DeducedA)) {
+      Result = QualType();
+      return DeductionFailed();
+    }
   }

   return DAR_Succeeded;
diff --git a/lib/Sema/SemaTemplateInstantiateDecl.cpp b/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 7328dcb8760f..f4013b820641 100644
--- a/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1470,8 +1470,11 @@ Decl *TemplateDeclInstantiator::VisitCXXRecordDecl(CXXRecordDecl *D) {
                              TSK_ImplicitInstantiation,
                              /*Complain=*/true);

-    SemaRef.InstantiateClassMembers(D->getLocation(), Record, TemplateArgs,
-                                    TSK_ImplicitInstantiation);
+    // For nested local classes, we will instantiate the members when we
+    // reach the end of the outermost (non-nested) local class.
+    if (!D->isCXXClassMember())
+      SemaRef.InstantiateClassMembers(D->getLocation(), Record, TemplateArgs,
+                                      TSK_ImplicitInstantiation);

     // This class may have local implicit instantiations that need to be
     // performed within this scope.
@@ -3616,6 +3619,27 @@ TemplateDeclInstantiator::InitMethodInstantiation(CXXMethodDecl *New,
   return false;
 }

+/// In the MS ABI, we need to instantiate default arguments of dllexported
+/// default constructors along with the constructor definition. This allows IR
+/// gen to emit a constructor closure which calls the default constructor with
+/// its default arguments.
+static void InstantiateDefaultCtorDefaultArgs(Sema &S,
+                                              CXXConstructorDecl *Ctor) {
+  assert(S.Context.getTargetInfo().getCXXABI().isMicrosoft() &&
+         Ctor->isDefaultConstructor());
+  unsigned NumParams = Ctor->getNumParams();
+  if (NumParams == 0)
+    return;
+  DLLExportAttr *Attr = Ctor->getAttr<DLLExportAttr>();
+  if (!Attr)
+    return;
+  for (unsigned I = 0; I != NumParams; ++I) {
+    (void)S.CheckCXXDefaultArgExpr(Attr->getLocation(), Ctor,
+                                   Ctor->getParamDecl(I));
+    S.DiscardCleanupsInEvaluationContext();
+  }
+}
+
 /// \brief Instantiate the definition of the given function from its
 /// template.
 ///
@@ -3793,11 +3817,17 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
                                TemplateArgs))
       return;

-    // If this is a constructor, instantiate the member initializers.
-    if (const CXXConstructorDecl *Ctor =
-          dyn_cast<CXXConstructorDecl>(PatternDecl)) {
-      InstantiateMemInitializers(cast<CXXConstructorDecl>(Function), Ctor,
+    if (CXXConstructorDecl *Ctor = dyn_cast<CXXConstructorDecl>(Function)) {
+      // If this is a constructor, instantiate the member initializers.
+      InstantiateMemInitializers(Ctor, cast<CXXConstructorDecl>(PatternDecl),
                                  TemplateArgs);
+
+      // If this is an MS ABI dllexport default constructor, instantiate any
+      // default arguments.
+      if (Context.getTargetInfo().getCXXABI().isMicrosoft() &&
+          Ctor->isDefaultConstructor()) {
+        InstantiateDefaultCtorDefaultArgs(*this, Ctor);
+      }
     }

     // Instantiate the function body.
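The kind of code the new InstantiateDefaultCtorDefaultArgs helper exists for, as a sketch (MS ABI only, so it needs clang-cl or -fms-extensions to compile):

// Exporting the instantiation forces the default argument to be instantiated
// together with the constructor, so IR gen can emit the constructor closure
// that calls S<int>::S(42) on behalf of importing code.
template <typename T> struct S {
  S(int x = 42) {}
};
template struct __declspec(dllexport) S<int>;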
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index fe2c53b77e1d..7f890051e641 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -8890,44 +8890,26 @@ void ASTReader::pushExternalDeclIntoScope(NamedDecl *D, DeclarationName Name) {
   }
 }

-ASTReader::ASTReader(
-    Preprocessor &PP, ASTContext &Context,
-    const PCHContainerReader &PCHContainerRdr,
-    ArrayRef<IntrusiveRefCntPtr<ModuleFileExtension>> Extensions,
-    StringRef isysroot, bool DisableValidation,
-    bool AllowASTWithCompilerErrors,
-    bool AllowConfigurationMismatch, bool ValidateSystemInputs,
-    bool UseGlobalIndex,
-    std::unique_ptr<llvm::Timer> ReadTimer)
-    : Listener(DisableValidation ?
-        cast<ASTReaderListener>(new SimpleASTReaderListener(PP)) :
-        cast<ASTReaderListener>(new PCHValidator(PP, *this))),
-      DeserializationListener(nullptr),
-      OwnsDeserializationListener(false), SourceMgr(PP.getSourceManager()),
-      FileMgr(PP.getFileManager()), PCHContainerRdr(PCHContainerRdr),
-      Diags(PP.getDiagnostics()), SemaObj(nullptr), PP(PP), Context(Context),
-      Consumer(nullptr), ModuleMgr(PP.getFileManager(), PCHContainerRdr),
-      DummyIdResolver(PP),
-      ReadTimer(std::move(ReadTimer)),
-      PragmaMSStructState(-1),
-      PragmaMSPointersToMembersState(-1),
-      isysroot(isysroot), DisableValidation(DisableValidation),
+ASTReader::ASTReader(Preprocessor &PP, ASTContext &Context,
+                     const PCHContainerReader &PCHContainerRdr,
+                     ArrayRef<std::shared_ptr<ModuleFileExtension>> Extensions,
+                     StringRef isysroot, bool DisableValidation,
+                     bool AllowASTWithCompilerErrors,
+                     bool AllowConfigurationMismatch, bool ValidateSystemInputs,
+                     bool UseGlobalIndex,
+                     std::unique_ptr<llvm::Timer> ReadTimer)
+    : Listener(DisableValidation
+                   ? cast<ASTReaderListener>(new SimpleASTReaderListener(PP))
+                   : cast<ASTReaderListener>(new PCHValidator(PP, *this))),
+      SourceMgr(PP.getSourceManager()), FileMgr(PP.getFileManager()),
+      PCHContainerRdr(PCHContainerRdr), Diags(PP.getDiagnostics()), PP(PP),
+      Context(Context), ModuleMgr(PP.getFileManager(), PCHContainerRdr),
+      DummyIdResolver(PP), ReadTimer(std::move(ReadTimer)), isysroot(isysroot),
+      DisableValidation(DisableValidation),
       AllowASTWithCompilerErrors(AllowASTWithCompilerErrors),
       AllowConfigurationMismatch(AllowConfigurationMismatch),
       ValidateSystemInputs(ValidateSystemInputs),
-      UseGlobalIndex(UseGlobalIndex), TriedLoadingGlobalIndex(false),
-      ProcessingUpdateRecords(false),
-      CurrSwitchCaseStmts(&SwitchCaseStmts), NumSLocEntriesRead(0),
-      TotalNumSLocEntries(0), NumStatementsRead(0), TotalNumStatements(0),
-      NumMacrosRead(0), TotalNumMacros(0), NumIdentifierLookups(0),
-      NumIdentifierLookupHits(0), NumSelectorsRead(0),
-      NumMethodPoolEntriesRead(0), NumMethodPoolLookups(0),
-      NumMethodPoolHits(0), NumMethodPoolTableLookups(0),
-      NumMethodPoolTableHits(0), TotalNumMethodPoolEntries(0),
-      NumLexicalDeclContextsRead(0), TotalLexicalDeclContexts(0),
-      NumVisibleDeclContextsRead(0), TotalVisibleDeclContexts(0),
-      TotalModulesSizeInBits(0), NumCurrentElementsDeserializing(0),
-      PassingDeclsToConsumer(false), ReadingKind(Read_None) {
+      UseGlobalIndex(UseGlobalIndex), CurrSwitchCaseStmts(&SwitchCaseStmts) {
   SourceMgr.setExternalSLocEntrySource(this);

   for (const auto &Ext : Extensions) {
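The constructor shrinks because the flag and counter members now carry in-class initializers (the matching header change is visible in the diffstat's ASTReader.h). The general shape of that cleanup, as a self-contained sketch with hypothetical names:

// Before: every counter spelled out in the constructor's init list.
struct Counters {
  Counters() : NumRead(0), NumTotal(0), Reading(false) {}
  unsigned NumRead, NumTotal;
  bool Reading;
};

// After: default member initializers; a constructor only needs to wire up
// whatever actually depends on its arguments.
struct CountersNSDMI {
  unsigned NumRead = 0, NumTotal = 0;
  bool Reading = false;
};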
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index 6d79ea53b659..2a5eda436f09 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -800,17 +800,17 @@ void TypeLocWriter::VisitPipeTypeLoc(PipeTypeLoc TL) {
 void ASTWriter::WriteTypeAbbrevs() {
   using namespace llvm;

-  BitCodeAbbrev *Abv;
+  std::shared_ptr<BitCodeAbbrev> Abv;

   // Abbreviation for TYPE_EXT_QUAL
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::TYPE_EXT_QUAL));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 3)); // Quals
-  TypeExtQualAbbrev = Stream.EmitAbbrev(Abv);
+  TypeExtQualAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for TYPE_FUNCTION_PROTO
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::TYPE_FUNCTION_PROTO));
   // FunctionType
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ReturnType
@@ -828,7 +828,7 @@ void ASTWriter::WriteTypeAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumParams
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Params
-  TypeFunctionProtoAbbrev = Stream.EmitAbbrev(Abv);
+  TypeFunctionProtoAbbrev = Stream.EmitAbbrev(std::move(Abv));
 }

 //===----------------------------------------------------------------------===//
@@ -1323,7 +1323,7 @@ uint64_t ASTWriter::WriteControlBlock(Preprocessor &PP,
   RecordData Record;

   // Metadata
-  auto *MetadataAbbrev = new BitCodeAbbrev();
+  auto MetadataAbbrev = std::make_shared<BitCodeAbbrev>();
   MetadataAbbrev->Add(BitCodeAbbrevOp(METADATA));
   MetadataAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Major
   MetadataAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Minor
@@ -1333,7 +1333,7 @@ uint64_t ASTWriter::WriteControlBlock(Preprocessor &PP,
   MetadataAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Timestamps
   MetadataAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Errors
   MetadataAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // SVN branch/tag
-  unsigned MetadataAbbrevCode = Stream.EmitAbbrev(MetadataAbbrev);
+  unsigned MetadataAbbrevCode = Stream.EmitAbbrev(std::move(MetadataAbbrev));
   assert((!WritingModule || isysroot.empty()) &&
          "writing module as a relocatable PCH?");
   {
@@ -1356,10 +1356,10 @@ uint64_t ASTWriter::WriteControlBlock(Preprocessor &PP,
   }

   // Module name
-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(MODULE_NAME));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned AbbrevCode = Stream.EmitAbbrev(Abbrev);
+  unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));
   RecordData::value_type Record[] = {MODULE_NAME};
   Stream.EmitRecordWithBlob(AbbrevCode, Record, WritingModule->Name);
 }
@@ -1376,10 +1376,10 @@ uint64_t ASTWriter::WriteControlBlock(Preprocessor &PP,
           .ModuleMapFileHomeIsCwd ||
       WritingModule->Directory->getName() != StringRef(".")) {
     // Module directory.
-    auto *Abbrev = new BitCodeAbbrev();
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(MODULE_DIRECTORY));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Directory
-    unsigned AbbrevCode = Stream.EmitAbbrev(Abbrev);
+    unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));

     RecordData::value_type Record[] = {MODULE_DIRECTORY};
     Stream.EmitRecordWithBlob(AbbrevCode, Record, BaseDir);
@@ -1586,11 +1586,11 @@ uint64_t ASTWriter::WriteControlBlock(Preprocessor &PP,
   // Original file name and file ID
   SourceManager &SM = Context.getSourceManager();
   if (const FileEntry *MainFile = SM.getFileEntryForID(SM.getMainFileID())) {
-    auto *FileAbbrev = new BitCodeAbbrev();
+    auto FileAbbrev = std::make_shared<BitCodeAbbrev>();
     FileAbbrev->Add(BitCodeAbbrevOp(ORIGINAL_FILE));
     FileAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // File ID
     FileAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // File name
-    unsigned FileAbbrevCode = Stream.EmitAbbrev(FileAbbrev);
+    unsigned FileAbbrevCode = Stream.EmitAbbrev(std::move(FileAbbrev));

     Record.clear();
     Record.push_back(ORIGINAL_FILE);
@@ -1604,10 +1604,10 @@ uint64_t ASTWriter::WriteControlBlock(Preprocessor &PP,

   // Original PCH directory
   if (!OutputFile.empty() && OutputFile != "-") {
-    auto *Abbrev = new BitCodeAbbrev();
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(ORIGINAL_PCH_DIR));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // File name
-    unsigned AbbrevCode = Stream.EmitAbbrev(Abbrev);
+    unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));

     SmallString<128> OutputPath(OutputFile);
@@ -1644,7 +1644,7 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
   Stream.EnterSubblock(INPUT_FILES_BLOCK_ID, 4);

   // Create input-file abbreviation.
-  auto *IFAbbrev = new BitCodeAbbrev();
+  auto IFAbbrev = std::make_shared<BitCodeAbbrev>();
   IFAbbrev->Add(BitCodeAbbrevOp(INPUT_FILE));
   IFAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ID
   IFAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 12)); // Size
@@ -1652,7 +1652,7 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
   IFAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Overridden
   IFAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Transient
   IFAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // File name
-  unsigned IFAbbrevCode = Stream.EmitAbbrev(IFAbbrev);
+  unsigned IFAbbrevCode = Stream.EmitAbbrev(std::move(IFAbbrev));

   // Get all ContentCache objects for files, sorted by whether the file is a
   // system one or not. System files go at the back, users files at the front.
@@ -1712,13 +1712,13 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
   Stream.ExitBlock();

   // Create input file offsets abbreviation.
-  auto *OffsetsAbbrev = new BitCodeAbbrev();
+  auto OffsetsAbbrev = std::make_shared<BitCodeAbbrev>();
   OffsetsAbbrev->Add(BitCodeAbbrevOp(INPUT_FILE_OFFSETS));
   OffsetsAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # input files
   OffsetsAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # non-system
                                                                 //  input files
   OffsetsAbbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));   // Array
-  unsigned OffsetsAbbrevCode = Stream.EmitAbbrev(OffsetsAbbrev);
+  unsigned OffsetsAbbrevCode = Stream.EmitAbbrev(std::move(OffsetsAbbrev));
   // Write input file offsets.
   RecordData::value_type Record[] = {INPUT_FILE_OFFSETS,
@@ -1735,7 +1735,7 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
 static unsigned CreateSLocFileAbbrev(llvm::BitstreamWriter &Stream) {
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SM_SLOC_FILE_ENTRY));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Offset
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Include location
@@ -1746,7 +1746,7 @@ static unsigned CreateSLocFileAbbrev(llvm::BitstreamWriter &Stream) {
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // NumCreatedFIDs
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 24)); // FirstDeclIndex
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // NumDecls
-  return Stream.EmitAbbrev(Abbrev);
+  return Stream.EmitAbbrev(std::move(Abbrev));
 }

 /// \brief Create an abbreviation for the SLocEntry that refers to a
@@ -1754,14 +1754,14 @@ static unsigned CreateSLocFileAbbrev(llvm::BitstreamWriter &Stream) {
 static unsigned CreateSLocBufferAbbrev(llvm::BitstreamWriter &Stream) {
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SM_SLOC_BUFFER_ENTRY));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Offset
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Include location
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // Characteristic
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Line directives
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Buffer name blob
-  return Stream.EmitAbbrev(Abbrev);
+  return Stream.EmitAbbrev(std::move(Abbrev));
 }

 /// \brief Create an abbreviation for the SLocEntry that refers to a
@@ -1770,13 +1770,13 @@ static unsigned CreateSLocBufferBlobAbbrev(llvm::BitstreamWriter &Stream,
                                            bool Compressed) {
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(Compressed ? SM_SLOC_BUFFER_BLOB_COMPRESSED
                                          : SM_SLOC_BUFFER_BLOB));
   if (Compressed)
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Uncompressed size
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Blob
-  return Stream.EmitAbbrev(Abbrev);
+  return Stream.EmitAbbrev(std::move(Abbrev));
 }

 /// \brief Create an abbreviation for the SLocEntry that refers to a macro
@@ -1784,14 +1784,14 @@ static unsigned CreateSLocBufferBlobAbbrev(llvm::BitstreamWriter &Stream,
 static unsigned CreateSLocExpansionAbbrev(llvm::BitstreamWriter &Stream) {
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SM_SLOC_EXPANSION_ENTRY));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Offset
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Spelling location
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // Start location
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // End location
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Token length
-  return Stream.EmitAbbrev(Abbrev);
+  return Stream.EmitAbbrev(std::move(Abbrev));
 }

 namespace {
@@ -1966,13 +1966,13 @@ void ASTWriter::WriteHeaderSearch(const HeaderSearch &HS) {

   // Create a blob abbreviation
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(HEADER_SEARCH_TABLE));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned TableAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned TableAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

   // Write the header search table
   RecordData::value_type Record[] = {HEADER_SEARCH_TABLE, BucketOffset,
@@ -2136,12 +2136,12 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
   // table is used for lazily loading source-location information.
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SOURCE_LOCATION_OFFSETS));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // # of slocs
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // total size
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // offsets
-  unsigned SLocOffsetsAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned SLocOffsetsAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   {
     RecordData::value_type Record[] = {
         SOURCE_LOCATION_OFFSETS, SLocEntryOffsets.size(),
@@ -2391,13 +2391,13 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {

   // Write the offsets table for macro IDs.
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(MACRO_OFFSET));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of macros
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned MacroOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned MacroOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   {
     RecordData::value_type Record[] = {MACRO_OFFSET, MacroOffsets.size(),
                                        FirstMacroID - NUM_PREDEF_MACRO_IDS};
@@ -2421,14 +2421,14 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {
   // Set up the abbreviation for
   unsigned InclusionAbbrev = 0;
   {
-    auto *Abbrev = new BitCodeAbbrev();
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(PPD_INCLUSION_DIRECTIVE));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // filename length
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // in quotes
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // kind
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // imported module
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-    InclusionAbbrev = Stream.EmitAbbrev(Abbrev);
+    InclusionAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   }

   unsigned FirstPreprocessorEntityID
@@ -2491,11 +2491,11 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {

   // Write the offsets table for identifier IDs.
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(PPD_ENTITIES_OFFSETS));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first pp entity
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned PPEOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned PPEOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

   RecordData::value_type Record[] = {PPD_ENTITIES_OFFSETS,
                                      FirstPreprocessorEntityID -
@@ -2549,7 +2549,7 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) {

   // Write the abbreviations needed for the submodules block.
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_DEFINITION));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ID
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Parent
@@ -2562,70 +2562,70 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) {
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // InferExportWild...
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ConfigMacrosExh...
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned DefinitionAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned DefinitionAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_UMBRELLA_HEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned UmbrellaAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned UmbrellaAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_HEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned HeaderAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned HeaderAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_TOPHEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned TopHeaderAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned TopHeaderAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_UMBRELLA_DIR));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned UmbrellaDirAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned UmbrellaDirAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_REQUIRES));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // State
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Feature
-  unsigned RequiresAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned RequiresAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_EXCLUDED_HEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned ExcludedHeaderAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned ExcludedHeaderAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_TEXTUAL_HEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned TextualHeaderAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned TextualHeaderAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_PRIVATE_HEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned PrivateHeaderAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned PrivateHeaderAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_PRIVATE_TEXTUAL_HEADER));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned PrivateTextualHeaderAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned PrivateTextualHeaderAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_LINK_LIBRARY));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsFramework
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Name
-  unsigned LinkLibraryAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned LinkLibraryAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_CONFIG_MACRO));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Macro name
-  unsigned ConfigMacroAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned ConfigMacroAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_CONFLICT));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Other module
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // Message
-  unsigned ConflictAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned ConflictAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

   // Write the submodule metadata block.
   RecordData::value_type Record[] = {getNumberOfModules(WritingModule),
@@ -2891,12 +2891,12 @@ void ASTWriter::WriteTypeDeclOffsets() {
   using namespace llvm;

   // Write the type offsets array
-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(TYPE_OFFSET));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of types
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // base type index
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // types block
-  unsigned TypeOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned TypeOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   {
     RecordData::value_type Record[] = {TYPE_OFFSET, TypeOffsets.size(),
                                        FirstTypeID - NUM_PREDEF_TYPE_IDS};
@@ -2904,12 +2904,12 @@ void ASTWriter::WriteTypeDeclOffsets() {
   }

   // Write the declaration offsets array
-  Abbrev = new BitCodeAbbrev();
+  Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(DECL_OFFSET));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of declarations
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // base decl ID
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // declarations block
-  unsigned DeclOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned DeclOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   {
     RecordData::value_type Record[] = {DECL_OFFSET, DeclOffsets.size(),
                                        FirstDeclID - NUM_PREDEF_DECL_IDS};
@@ -2934,11 +2934,11 @@ void ASTWriter::WriteFileDeclIDsMap() {
     FileGroupedDeclIDs.push_back(LocDeclEntry.second);
   }

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(FILE_SORTED_DECLS));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned AbbrevCode = Stream.EmitAbbrev(Abbrev);
+  unsigned AbbrevCode = Stream.EmitAbbrev(std::move(Abbrev));
   RecordData::value_type Record[] = {FILE_SORTED_DECLS,
                                      FileGroupedDeclIDs.size()};
   Stream.EmitRecordWithBlob(AbbrevCode, Record, bytes(FileGroupedDeclIDs));
@@ -3142,12 +3142,12 @@ void ASTWriter::WriteSelectors(Sema &SemaRef) {
     }

     // Create a blob abbreviation
-    auto *Abbrev = new BitCodeAbbrev();
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(METHOD_POOL));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-    unsigned MethodPoolAbbrev = Stream.EmitAbbrev(Abbrev);
+    unsigned MethodPoolAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

     // Write the method pool
     {
@@ -3157,12 +3157,12 @@ void ASTWriter::WriteSelectors(Sema &SemaRef) {
     }

     // Create a blob abbreviation for the selector table offsets.
-    Abbrev = new BitCodeAbbrev();
+    Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(SELECTOR_OFFSETS));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // size
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-    unsigned SelectorOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
+    unsigned SelectorOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

     // Write the selector offsets table.
     {
@@ -3452,11 +3452,11 @@ void ASTWriter::WriteIdentifierTable(Preprocessor &PP,
     }

     // Create a blob abbreviation
-    auto *Abbrev = new BitCodeAbbrev();
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(IDENTIFIER_TABLE));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-    unsigned IDTableAbbrev = Stream.EmitAbbrev(Abbrev);
+    unsigned IDTableAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

     // Write the identifier table
     RecordData::value_type Record[] = {IDENTIFIER_TABLE, BucketOffset};
@@ -3464,12 +3464,12 @@ void ASTWriter::WriteIdentifierTable(Preprocessor &PP,
   }

   // Write the offsets table for identifier IDs.
-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(IDENTIFIER_OFFSET));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of identifiers
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned IdentifierOffsetAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned IdentifierOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

 #ifndef NDEBUG
   for (unsigned I = 0, N = IdentifierOffsets.size(); I != N; ++I)
@@ -4025,11 +4025,11 @@ void ASTWriter::WriteObjCCategories() {

   // Emit the categories map.
   using namespace llvm;

-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(OBJC_CATEGORIES_MAP));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of entries
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned AbbrevID = Stream.EmitAbbrev(Abbrev);
+  unsigned AbbrevID = Stream.EmitAbbrev(std::move(Abbrev));

   RecordData::value_type Record[] = {OBJC_CATEGORIES_MAP, CategoriesMap.size()};
   Stream.EmitRecordWithBlob(AbbrevID, Record,
@@ -4091,14 +4091,14 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef,
   Stream.EnterSubblock(EXTENSION_BLOCK_ID, 4);

   // Emit the metadata record abbreviation.
-  auto *Abv = new llvm::BitCodeAbbrev();
+  auto Abv = std::make_shared<llvm::BitCodeAbbrev>();
   Abv->Add(llvm::BitCodeAbbrevOp(EXTENSION_METADATA));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob));
-  unsigned Abbrev = Stream.EmitAbbrev(Abv);
+  unsigned Abbrev = Stream.EmitAbbrev(std::move(Abv));

   // Emit the metadata record.
   RecordData Record;
@@ -4221,29 +4221,10 @@ void ASTWriter::SetSelectorOffset(Selector Sel, uint32_t Offset) {
   SelectorOffsets[ID - FirstSelectorID] = Offset;
 }

-ASTWriter::ASTWriter(
-    llvm::BitstreamWriter &Stream,
-    ArrayRef<llvm::IntrusiveRefCntPtr<ModuleFileExtension>> Extensions,
-    bool IncludeTimestamps)
-    : Stream(Stream), Context(nullptr), PP(nullptr), Chain(nullptr),
-      WritingModule(nullptr), IncludeTimestamps(IncludeTimestamps),
-      WritingAST(false), DoneWritingDeclsAndTypes(false),
-      ASTHasCompilerErrors(false), FirstDeclID(NUM_PREDEF_DECL_IDS),
-      NextDeclID(FirstDeclID), FirstTypeID(NUM_PREDEF_TYPE_IDS),
-      NextTypeID(FirstTypeID), FirstIdentID(NUM_PREDEF_IDENT_IDS),
-      NextIdentID(FirstIdentID), FirstMacroID(NUM_PREDEF_MACRO_IDS),
-      NextMacroID(FirstMacroID), FirstSubmoduleID(NUM_PREDEF_SUBMODULE_IDS),
-      NextSubmoduleID(FirstSubmoduleID),
-      FirstSelectorID(NUM_PREDEF_SELECTOR_IDS), NextSelectorID(FirstSelectorID),
-      NumStatements(0), NumMacros(0),
-      NumLexicalDeclContexts(0), NumVisibleDeclContexts(0),
-      TypeExtQualAbbrev(0), TypeFunctionProtoAbbrev(0), DeclParmVarAbbrev(0),
-      DeclContextLexicalAbbrev(0), DeclContextVisibleLookupAbbrev(0),
-      UpdateVisibleAbbrev(0), DeclRecordAbbrev(0), DeclTypedefAbbrev(0),
-      DeclVarAbbrev(0), DeclFieldAbbrev(0), DeclEnumAbbrev(0),
-      DeclObjCIvarAbbrev(0), DeclCXXMethodAbbrev(0), DeclRefExprAbbrev(0),
-      CharacterLiteralAbbrev(0), IntegerLiteralAbbrev(0),
-      ExprImplicitCastAbbrev(0) {
+ASTWriter::ASTWriter(llvm::BitstreamWriter &Stream,
+                     ArrayRef<std::shared_ptr<ModuleFileExtension>> Extensions,
+                     bool IncludeTimestamps)
+    : Stream(Stream), IncludeTimestamps(IncludeTimestamps) {
   for (const auto &Ext : Extensions) {
     if (auto Writer = Ext->createExtensionWriter(*this))
       ModuleFileExtensionWriters.push_back(std::move(Writer));
@@ -4474,10 +4455,10 @@ uint64_t ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
     }
   }

-  auto *Abv = new llvm::BitCodeAbbrev();
+  auto Abv = std::make_shared<llvm::BitCodeAbbrev>();
   Abv->Add(llvm::BitCodeAbbrevOp(TU_UPDATE_LEXICAL));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob));
-  unsigned TuUpdateLexicalAbbrev = Stream.EmitAbbrev(Abv);
+  unsigned TuUpdateLexicalAbbrev = Stream.EmitAbbrev(std::move(Abv));
   {
     RecordData::value_type Record[] = {TU_UPDATE_LEXICAL};
     Stream.EmitRecordWithBlob(TuUpdateLexicalAbbrev, Record,
@@ -4485,11 +4466,11 @@ uint64_t ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
   }

   // And a visible updates block for the translation unit.
-  Abv = new llvm::BitCodeAbbrev();
+  Abv = std::make_shared<llvm::BitCodeAbbrev>();
   Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_VISIBLE));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6));
   Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob));
-  UpdateVisibleAbbrev = Stream.EmitAbbrev(Abv);
+  UpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv));
   WriteDeclContextVisibleUpdate(TU);

   // If we have any extern "C" names, write out a visible update for them.
@@ -4584,10 +4565,10 @@ uint64_t ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot,
   //     c++-base-specifiers-id:i32
   //     type-id:i32)
   //
-  auto *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(MODULE_OFFSET_MAP));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned ModuleOffsetMapAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned ModuleOffsetMapAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   SmallString<2048> Buffer;
   {
     llvm::raw_svector_ostream Out(Buffer);
diff --git a/lib/Serialization/ASTWriterDecl.cpp b/lib/Serialization/ASTWriterDecl.cpp
index ee220f00a81f..8e1480739a5f 100644
--- a/lib/Serialization/ASTWriterDecl.cpp
+++ b/lib/Serialization/ASTWriterDecl.cpp
@@ -1702,10 +1702,10 @@ void ASTDeclWriter::VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D) {
 void ASTWriter::WriteDeclAbbrevs() {
   using namespace llvm;

-  BitCodeAbbrev *Abv;
+  std::shared_ptr<BitCodeAbbrev> Abv;

   // Abbreviation for DECL_FIELD
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_FIELD));
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
@@ -1735,10 +1735,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TypeLoc
-  DeclFieldAbbrev = Stream.EmitAbbrev(Abv);
+  DeclFieldAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_OBJC_IVAR
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_OBJC_IVAR));
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
@@ -1771,10 +1771,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TypeLoc
-  DeclObjCIvarAbbrev = Stream.EmitAbbrev(Abv);
+  DeclObjCIvarAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_ENUM
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_ENUM));
   // Redeclarable
   Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
@@ -1820,10 +1820,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   // DC
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset
-  DeclEnumAbbrev = Stream.EmitAbbrev(Abv);
+  DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_RECORD
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_RECORD));
   // Redeclarable
   Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
@@ -1864,10 +1864,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   // DC
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset
-  DeclRecordAbbrev = Stream.EmitAbbrev(Abv);
+  DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_PARM_VAR
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_PARM_VAR));
   // Redeclarable
   Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
@@ -1911,10 +1911,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TypeLoc
-  DeclParmVarAbbrev = Stream.EmitAbbrev(Abv);
+  DeclParmVarAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_TYPEDEF
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_TYPEDEF));
   // Redeclarable
   Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
@@ -1940,10 +1940,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   // TypedefDecl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TypeLoc
-  DeclTypedefAbbrev = Stream.EmitAbbrev(Abv);
+  DeclTypedefAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_VAR
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_VAR));
   // Redeclarable
   Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
@@ -1989,10 +1989,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TypeLoc
-  DeclVarAbbrev = Stream.EmitAbbrev(Abv);
+  DeclVarAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for DECL_CXX_METHOD
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_CXX_METHOD));
   // RedeclarableDecl
   Abv->Add(BitCodeAbbrevOp(0)); // CanonicalDecl
@@ -2047,10 +2047,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   //       Add an AbbrevOp for 'size then elements' and use it here.
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
-  DeclCXXMethodAbbrev = Stream.EmitAbbrev(Abv);
+  DeclCXXMethodAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for EXPR_DECL_REF
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::EXPR_DECL_REF));
   //Stmt
   //Expr
@@ -2070,10 +2070,10 @@ void ASTWriter::WriteDeclAbbrevs() {
                            1)); // RefersToEnclosingVariableOrCapture
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclRef
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location
-  DeclRefExprAbbrev = Stream.EmitAbbrev(Abv);
+  DeclRefExprAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for EXPR_INTEGER_LITERAL
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::EXPR_INTEGER_LITERAL));
   //Stmt
   //Expr
@@ -2088,10 +2088,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location
   Abv->Add(BitCodeAbbrevOp(32)); // Bit Width
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Value
-  IntegerLiteralAbbrev = Stream.EmitAbbrev(Abv);
+  IntegerLiteralAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for EXPR_CHARACTER_LITERAL
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CHARACTER_LITERAL));
   //Stmt
   //Expr
@@ -2106,10 +2106,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // getValue
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // getKind
-  CharacterLiteralAbbrev = Stream.EmitAbbrev(Abv);
+  CharacterLiteralAbbrev = Stream.EmitAbbrev(std::move(Abv));

   // Abbreviation for EXPR_IMPLICIT_CAST
-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::EXPR_IMPLICIT_CAST));
   // Stmt
   // Expr
@@ -2124,17 +2124,17 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(0)); // PathSize
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 6)); // CastKind
   // ImplicitCastExpr
-  ExprImplicitCastAbbrev = Stream.EmitAbbrev(Abv);
+  ExprImplicitCastAbbrev = Stream.EmitAbbrev(std::move(Abv));

-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_LEXICAL));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  DeclContextLexicalAbbrev = Stream.EmitAbbrev(Abv);
+  DeclContextLexicalAbbrev = Stream.EmitAbbrev(std::move(Abv));

-  Abv = new BitCodeAbbrev();
+  Abv = std::make_shared<BitCodeAbbrev>();
   Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_VISIBLE));
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  DeclContextVisibleLookupAbbrev = Stream.EmitAbbrev(Abv);
+  DeclContextVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv));
 }

 /// isRequiredDecl - Check if this is a "required" Decl, which must be seen by
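Every one of the writer hunks above is the same mechanical migration: abbreviations are now built in a shared_ptr and ownership is transferred into the bitstream, instead of handing EmitAbbrev a raw owning pointer. The pattern, isolated into a small helper (a sketch against the LLVM API as of this revision, where BitCodeAbbrev lives in llvm/Bitcode/BitCodes.h):

#include "llvm/Bitcode/BitCodes.h"
#include "llvm/Bitcode/BitstreamWriter.h"

// Build a record-code + blob abbreviation and hand it to the stream; after the
// std::move the local no longer owns the abbreviation.
static unsigned emitBlobAbbrev(llvm::BitstreamWriter &Stream, unsigned RecCode) {
  auto Abbrev = std::make_shared<llvm::BitCodeAbbrev>();
  Abbrev->Add(llvm::BitCodeAbbrevOp(RecCode));
  Abbrev->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob));
  return Stream.EmitAbbrev(std::move(Abbrev)); // was: EmitAbbrev(new BitCodeAbbrev())
}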
diff --git a/lib/Serialization/GeneratePCH.cpp b/lib/Serialization/GeneratePCH.cpp
index e1765dafd96f..7f1b75055b45 100644
--- a/lib/Serialization/GeneratePCH.cpp
+++ b/lib/Serialization/GeneratePCH.cpp
@@ -24,7 +24,7 @@ using namespace clang;
 PCHGenerator::PCHGenerator(
     const Preprocessor &PP, StringRef OutputFile, StringRef isysroot,
     std::shared_ptr<PCHBuffer> Buffer,
-    ArrayRef<llvm::IntrusiveRefCntPtr<ModuleFileExtension>> Extensions,
+    ArrayRef<std::shared_ptr<ModuleFileExtension>> Extensions,
     bool AllowASTWithErrors, bool IncludeTimestamps)
     : PP(PP), OutputFile(OutputFile), isysroot(isysroot.str()),
       SemaPtr(nullptr), Buffer(Buffer), Stream(Buffer->Data),
diff --git a/lib/Serialization/GlobalModuleIndex.cpp b/lib/Serialization/GlobalModuleIndex.cpp
index 9f986d54a989..ae5796ede126 100644
--- a/lib/Serialization/GlobalModuleIndex.cpp
+++ b/lib/Serialization/GlobalModuleIndex.cpp
@@ -744,11 +744,11 @@ void GlobalModuleIndexBuilder::writeIndex(llvm::BitstreamWriter &Stream) {
   }

   // Create a blob abbreviation
-  BitCodeAbbrev *Abbrev = new BitCodeAbbrev();
+  auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(IDENTIFIER_INDEX));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-  unsigned IDTableAbbrev = Stream.EmitAbbrev(Abbrev);
+  unsigned IDTableAbbrev = Stream.EmitAbbrev(std::move(Abbrev));

   // Write the identifier table
   uint64_t Record[] = {IDENTIFIER_INDEX, BucketOffset};
diff --git a/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp b/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp
index a37ebc506d04..109897be2931 100644
--- a/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp
@@ -49,10 +49,10 @@ class DynamicTypeChecker : public Checker<check::PostStmt<ImplicitCastExpr>> {
       ID.AddPointer(Reg);
     }

-    PathDiagnosticPiece *VisitNode(const ExplodedNode *N,
-                                   const ExplodedNode *PrevN,
-                                   BugReporterContext &BRC,
-                                   BugReport &BR) override;
+    std::shared_ptr<PathDiagnosticPiece> VisitNode(const ExplodedNode *N,
+                                                   const ExplodedNode *PrevN,
+                                                   BugReporterContext &BRC,
+                                                   BugReport &BR) override;

   private:
     // The tracked region.
@@ -91,9 +91,11 @@ void DynamicTypeChecker::reportTypeError(QualType DynamicType,
   C.emitReport(std::move(R));
 }

-PathDiagnosticPiece *DynamicTypeChecker::DynamicTypeBugVisitor::VisitNode(
-    const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC,
-    BugReport &BR) {
+std::shared_ptr<PathDiagnosticPiece>
+DynamicTypeChecker::DynamicTypeBugVisitor::VisitNode(const ExplodedNode *N,
+                                                     const ExplodedNode *PrevN,
+                                                     BugReporterContext &BRC,
+                                                     BugReport &BR) {
   ProgramStateRef State = N->getState();
   ProgramStateRef StatePrev = PrevN->getState();
@@ -143,7 +145,8 @@ PathDiagnosticPiece *DynamicTypeChecker::DynamicTypeBugVisitor::VisitNode(
   // Generate the extra diagnostic.
   PathDiagnosticLocation Pos(S, BRC.getSourceManager(),
                              N->getLocationContext());
-  return new PathDiagnosticEventPiece(Pos, OS.str(), true, nullptr);
+  return std::make_shared<PathDiagnosticEventPiece>(Pos, OS.str(), true,
+                                                    nullptr);
 }

 static bool hasDefinition(const ObjCObjectPointerType *ObjPtr) {
diff --git a/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp b/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
index a418c82f5a01..0891ea85a714 100644
--- a/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
+++ b/lib/StaticAnalyzer/Checkers/DynamicTypePropagation.cpp
@@ -83,10 +83,10 @@ class DynamicTypePropagation:
       ID.AddPointer(Sym);
     }

-    PathDiagnosticPiece *VisitNode(const ExplodedNode *N,
-                                   const ExplodedNode *PrevN,
-                                   BugReporterContext &BRC,
-                                   BugReport &BR) override;
+    std::shared_ptr<PathDiagnosticPiece> VisitNode(const ExplodedNode *N,
+                                                   const ExplodedNode *PrevN,
+                                                   BugReporterContext &BRC,
+                                                   BugReport &BR) override;

   private:
     // The tracked symbol.
@@ -923,9 +923,11 @@ void DynamicTypePropagation::reportGenericsBug(
   C.emitReport(std::move(R));
 }

-PathDiagnosticPiece *DynamicTypePropagation::GenericsBugVisitor::VisitNode(
-    const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC,
-    BugReport &BR) {
+std::shared_ptr<PathDiagnosticPiece>
+DynamicTypePropagation::GenericsBugVisitor::VisitNode(const ExplodedNode *N,
+                                                      const ExplodedNode *PrevN,
+                                                      BugReporterContext &BRC,
+                                                      BugReport &BR) {
   ProgramStateRef state = N->getState();
   ProgramStateRef statePrev = PrevN->getState();
@@ -975,7 +977,8 @@ PathDiagnosticPiece *DynamicTypePropagation::GenericsBugVisitor::VisitNode(
   // Generate the extra diagnostic.
   PathDiagnosticLocation Pos(S, BRC.getSourceManager(),
                              N->getLocationContext());
-  return new PathDiagnosticEventPiece(Pos, OS.str(), true, nullptr);
+  return std::make_shared<PathDiagnosticEventPiece>(Pos, OS.str(), true,
+                                                    nullptr);
 }

 /// Register checkers.
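All of the analyzer hunks in this patch migrate BugReporterVisitor::VisitNode from returning a raw owning PathDiagnosticPiece* to shared ownership. A sketch of the updated contract in a hypothetical checker (MyVisitor is not from the patch):

std::shared_ptr<PathDiagnosticPiece>
MyVisitor::VisitNode(const ExplodedNode *N, const ExplodedNode *PrevN,
                     BugReporterContext &BRC, BugReport &BR) {
  const Stmt *S = PathDiagnosticLocation::getStmt(N);
  if (!S)
    return nullptr; // returning an empty shared_ptr is still permitted
  PathDiagnosticLocation Pos(S, BRC.getSourceManager(),
                             N->getLocationContext());
  return std::make_shared<PathDiagnosticEventPiece>(Pos, "tracked value used here");
}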
diff --git a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp index d1dab6d27d45..af35c2b0e991 100644 --- a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp @@ -123,10 +123,10 @@ public: assert(NonLocalizedString); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, + BugReport &BR) override; void Profile(llvm::FoldingSetNodeID &ID) const override { ID.Add(NonLocalizedString); @@ -910,7 +910,7 @@ void NonLocalizedStringChecker::checkPostStmt(const ObjCStringLiteral *SL, setNonLocalizedState(sv, C); } -PathDiagnosticPiece * +std::shared_ptr NonLocalizedStringBRVisitor::VisitNode(const ExplodedNode *Succ, const ExplodedNode *Pred, BugReporterContext &BRC, BugReport &BR) { @@ -938,11 +938,11 @@ NonLocalizedStringBRVisitor::VisitNode(const ExplodedNode *Succ, if (!L.isValid() || !L.asLocation().isValid()) return nullptr; - auto *Piece = new PathDiagnosticEventPiece(L, - "Non-localized string literal here"); + auto Piece = std::make_shared( + L, "Non-localized string literal here"); Piece->addRange(LiteralExpr->getSourceRange()); - return Piece; + return std::move(Piece); } namespace { diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp index d56ea6d689d3..e9ec7a0c4365 100644 --- a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp +++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp @@ -85,9 +85,11 @@ void MPIBugReporter::reportUnmatchedWait( BReporter.emitReport(std::move(Report)); } -PathDiagnosticPiece *MPIBugReporter::RequestNodeVisitor::VisitNode( - const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +MPIBugReporter::RequestNodeVisitor::VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) { if (IsNodeFound) return nullptr; @@ -104,7 +106,7 @@ PathDiagnosticPiece *MPIBugReporter::RequestNodeVisitor::VisitNode( PathDiagnosticLocation L = PathDiagnosticLocation::create(P, BRC.getSourceManager()); - return new PathDiagnosticEventPiece(L, ErrorText); + return std::make_shared(L, ErrorText); } return nullptr; diff --git a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h index 8474d2d194e8..0ee91cca4793 100644 --- a/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h +++ b/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h @@ -90,10 +90,10 @@ private: ID.AddPointer(RequestRegion); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; private: const MemRegion *const RequestRegion; diff --git a/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp b/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp index 86c827045e9a..f1aa16391db1 100644 --- a/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp @@ -143,10 +143,10 @@ private: ID.AddPointer(Sym); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const 
ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; }; }; } @@ -583,12 +583,10 @@ void MacOSKeychainAPIChecker::checkDeadSymbols(SymbolReaper &SR, C.addTransition(State, N); } - -PathDiagnosticPiece *MacOSKeychainAPIChecker::SecKeychainBugVisitor::VisitNode( - const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +MacOSKeychainAPIChecker::SecKeychainBugVisitor::VisitNode( + const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC, + BugReport &BR) { const AllocationState *AS = N->getState()->get(Sym); if (!AS) return nullptr; @@ -610,7 +608,8 @@ PathDiagnosticPiece *MacOSKeychainAPIChecker::SecKeychainBugVisitor::VisitNode( const Expr *ArgExpr = CE->getArg(FunctionsToTrack[Idx].Param); PathDiagnosticLocation Pos(ArgExpr, BRC.getSourceManager(), N->getLocationContext()); - return new PathDiagnosticEventPiece(Pos, "Data is allocated here."); + return std::make_shared(Pos, + "Data is allocated here."); } void ento::registerMacOSKeychainAPIChecker(CheckerManager &mgr) { diff --git a/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index f7c4ea10c438..8e839a1d28fd 100644 --- a/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -463,10 +463,10 @@ private: SPrev->isAllocatedOfSizeZero()))); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; std::unique_ptr getEndPath(BugReporterContext &BRC, const ExplodedNode *EndPathNode, @@ -2668,11 +2668,9 @@ static SymbolRef findFailedReallocSymbol(ProgramStateRef currState, return nullptr; } -PathDiagnosticPiece * -MallocChecker::MallocBugVisitor::VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr MallocChecker::MallocBugVisitor::VisitNode( + const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC, + BugReport &BR) { ProgramStateRef state = N->getState(); ProgramStateRef statePrev = PrevN->getState(); @@ -2740,7 +2738,7 @@ MallocChecker::MallocBugVisitor::VisitNode(const ExplodedNode *N, // Generate the extra diagnostic. PathDiagnosticLocation Pos(S, BRC.getSourceManager(), N->getLocationContext()); - return new PathDiagnosticEventPiece(Pos, Msg, true, StackHint); + return std::make_shared(Pos, Msg, true, StackHint); } void MallocChecker::printState(raw_ostream &Out, ProgramStateRef State, diff --git a/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp index d96017a1f532..c14a87c9d2a4 100644 --- a/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp @@ -153,10 +153,10 @@ private: ID.AddPointer(Region); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; private: // The tracked region. 
@@ -306,9 +306,11 @@ NullabilityChecker::getTrackRegion(SVal Val, bool CheckSuperRegion) const { return dyn_cast(Region); } -PathDiagnosticPiece *NullabilityChecker::NullabilityBugVisitor::VisitNode( - const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +NullabilityChecker::NullabilityBugVisitor::VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) { ProgramStateRef State = N->getState(); ProgramStateRef StatePrev = PrevN->getState(); @@ -339,7 +341,8 @@ PathDiagnosticPiece *NullabilityChecker::NullabilityBugVisitor::VisitNode( // Generate the extra diagnostic. PathDiagnosticLocation Pos(S, BRC.getSourceManager(), N->getLocationContext()); - return new PathDiagnosticEventPiece(Pos, InfoText, true, nullptr); + return std::make_shared(Pos, InfoText, true, + nullptr); } static Nullability getNullabilityAnnotation(QualType Type) { diff --git a/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp b/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp index e75d20897710..075ff09dcbfa 100644 --- a/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp @@ -73,10 +73,10 @@ public: : ReceiverSymbol(ReceiverSymbol), Satisfied(false) {} - PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, + BugReport &BR) override; void Profile(llvm::FoldingSetNodeID &ID) const override { ID.Add(ReceiverSymbol); @@ -249,10 +249,10 @@ ObjCSuperDeallocChecker::isSuperDeallocMessage(const ObjCMethodCall &M) const { return M.getSelector() == SELdealloc; } -PathDiagnosticPiece *SuperDeallocBRVisitor::VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +SuperDeallocBRVisitor::VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, BugReport &BR) { if (Satisfied) return nullptr; @@ -275,7 +275,7 @@ PathDiagnosticPiece *SuperDeallocBRVisitor::VisitNode(const ExplodedNode *Succ, if (!L.isValid() || !L.asLocation().isValid()) return nullptr; - return new PathDiagnosticEventPiece( + return std::make_shared( L, "[super dealloc] called here"); } diff --git a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp index 204b0a6c468b..eb101e12af25 100644 --- a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp @@ -1773,10 +1773,10 @@ namespace { ID.AddPointer(Sym); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; std::unique_ptr getEndPath(BugReporterContext &BRC, const ExplodedNode *N, @@ -1899,10 +1899,9 @@ static bool isSynthesizedAccessor(const StackFrameContext *SFC) { return SFC->getAnalysisDeclContext()->isBodyAutosynthesized(); } -PathDiagnosticPiece *CFRefReportVisitor::VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +CFRefReportVisitor::VisitNode(const ExplodedNode *N, const ExplodedNode *PrevN, + BugReporterContext &BRC, BugReport &BR) { // 
FIXME: We will eventually need to handle non-statement-based events // (__attribute__((cleanup))). if (!N->getLocation().getAs()) @@ -2026,7 +2025,7 @@ PathDiagnosticPiece *CFRefReportVisitor::VisitNode(const ExplodedNode *N, PathDiagnosticLocation Pos(S, BRC.getSourceManager(), N->getLocationContext()); - return new PathDiagnosticEventPiece(Pos, os.str()); + return std::make_shared(Pos, os.str()); } // Gather up the effects that were performed on the object at this @@ -2203,7 +2202,7 @@ PathDiagnosticPiece *CFRefReportVisitor::VisitNode(const ExplodedNode *N, const Stmt *S = N->getLocation().castAs().getStmt(); PathDiagnosticLocation Pos(S, BRC.getSourceManager(), N->getLocationContext()); - PathDiagnosticPiece *P = new PathDiagnosticEventPiece(Pos, os.str()); + auto P = std::make_shared(Pos, os.str()); // Add the range by scanning the children of the statement for any bindings // to Sym. @@ -2214,7 +2213,7 @@ PathDiagnosticPiece *CFRefReportVisitor::VisitNode(const ExplodedNode *N, break; } - return P; + return std::move(P); } namespace { diff --git a/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp b/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp index b794d2f86bbe..5268bbf5562e 100644 --- a/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp @@ -70,10 +70,10 @@ public: ID.Add(SFC); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, + BugReport &BR) override; }; class TestAfterDivZeroChecker @@ -94,10 +94,9 @@ public: REGISTER_SET_WITH_PROGRAMSTATE(DivZeroMap, ZeroState) -PathDiagnosticPiece *DivisionBRVisitor::VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +DivisionBRVisitor::VisitNode(const ExplodedNode *Succ, const ExplodedNode *Pred, + BugReporterContext &BRC, BugReport &BR) { if (Satisfied) return nullptr; @@ -128,7 +127,7 @@ PathDiagnosticPiece *DivisionBRVisitor::VisitNode(const ExplodedNode *Succ, if (!L.isValid() || !L.asLocation().isValid()) return nullptr; - return new PathDiagnosticEventPiece( + return std::make_shared( L, "Division with compared value made here"); } diff --git a/lib/StaticAnalyzer/Checkers/ValistChecker.cpp b/lib/StaticAnalyzer/Checkers/ValistChecker.cpp index b4bfa0c03341..0b7a4865ddc2 100644 --- a/lib/StaticAnalyzer/Checkers/ValistChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/ValistChecker.cpp @@ -91,10 +91,10 @@ private: return llvm::make_unique(L, BR.getDescription(), false); } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override; + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override; private: const MemRegion *Reg; @@ -335,7 +335,7 @@ void ValistChecker::checkVAListEndCall(const CallEvent &Call, C.addTransition(State); } -PathDiagnosticPiece *ValistChecker::ValistBugVisitor::VisitNode( +std::shared_ptr ValistChecker::ValistBugVisitor::VisitNode( const ExplodedNode *N, const ExplodedNode *PrevN, BugReporterContext &BRC, BugReport &BR) { ProgramStateRef State = N->getState(); @@ -358,7 +358,7 @@ PathDiagnosticPiece *ValistChecker::ValistBugVisitor::VisitNode( PathDiagnosticLocation Pos(S, BRC.getSourceManager(), N->getLocationContext()); 
- return new PathDiagnosticEventPiece(Pos, Msg, true); + return std::make_shared(Pos, Msg, true); } #define REGISTER_CHECKER(name) \ diff --git a/lib/StaticAnalyzer/Core/BugReporter.cpp b/lib/StaticAnalyzer/Core/BugReporter.cpp index 53b4e699f7ad..2114033ba8b5 100644 --- a/lib/StaticAnalyzer/Core/BugReporter.cpp +++ b/lib/StaticAnalyzer/Core/BugReporter.cpp @@ -111,15 +111,15 @@ static void removeRedundantMsgs(PathPieces &path) { // grabbing the front, processing it, and if we decide to keep it append // it to the end of the path. The entire path is processed in this way. for (unsigned i = 0; i < N; ++i) { - IntrusiveRefCntPtr piece(path.front()); + auto piece = std::move(path.front()); path.pop_front(); switch (piece->getKind()) { case PathDiagnosticPiece::Call: - removeRedundantMsgs(cast(piece)->path); + removeRedundantMsgs(cast(*piece).path); break; case PathDiagnosticPiece::Macro: - removeRedundantMsgs(cast(piece)->subPieces); + removeRedundantMsgs(cast(*piece).subPieces); break; case PathDiagnosticPiece::ControlFlow: break; @@ -130,13 +130,13 @@ static void removeRedundantMsgs(PathPieces &path) { if (PathDiagnosticEventPiece *nextEvent = dyn_cast(path.front().get())) { PathDiagnosticEventPiece *event = - cast(piece); + cast(piece.get()); // Check to see if we should keep one of the two pieces. If we // come up with a preference, record which piece to keep, and consume // another piece from the path. - if (PathDiagnosticEventPiece *pieceToKeep = - eventsDescribeSameCondition(event, nextEvent)) { - piece = pieceToKeep; + if (auto *pieceToKeep = + eventsDescribeSameCondition(event, nextEvent)) { + piece = std::move(pieceToKeep == event ? piece : path.front()); path.pop_front(); ++i; } @@ -146,7 +146,7 @@ static void removeRedundantMsgs(PathPieces &path) { case PathDiagnosticPiece::Note: break; } - path.push_back(piece); + path.push_back(std::move(piece)); } } @@ -166,38 +166,38 @@ static bool removeUnneededCalls(PathPieces &pieces, BugReport *R, for (unsigned i = 0 ; i < N ; ++i) { // Remove the front piece from the path. If it is still something we // want to keep once we are done, we will push it back on the end. - IntrusiveRefCntPtr piece(pieces.front()); + auto piece = std::move(pieces.front()); pieces.pop_front(); switch (piece->getKind()) { case PathDiagnosticPiece::Call: { - PathDiagnosticCallPiece *call = cast(piece); + auto &call = cast(*piece); // Check if the location context is interesting. - assert(LCM.count(&call->path)); - if (R->isInteresting(LCM[&call->path])) { + assert(LCM.count(&call.path)); + if (R->isInteresting(LCM[&call.path])) { containsSomethingInteresting = true; break; } - if (!removeUnneededCalls(call->path, R, LCM)) + if (!removeUnneededCalls(call.path, R, LCM)) continue; containsSomethingInteresting = true; break; } case PathDiagnosticPiece::Macro: { - PathDiagnosticMacroPiece *macro = cast(piece); - if (!removeUnneededCalls(macro->subPieces, R, LCM)) + auto ¯o = cast(*piece); + if (!removeUnneededCalls(macro.subPieces, R, LCM)) continue; containsSomethingInteresting = true; break; } case PathDiagnosticPiece::Event: { - PathDiagnosticEventPiece *event = cast(piece); + auto &event = cast(*piece); // We never throw away an event, but we do throw it away wholesale // as part of a path if we throw the entire path away. 
- containsSomethingInteresting |= !event->isPrunable(); + containsSomethingInteresting |= !event.isPrunable(); break; } case PathDiagnosticPiece::ControlFlow: @@ -207,7 +207,7 @@ static bool removeUnneededCalls(PathPieces &pieces, BugReport *R, break; } - pieces.push_back(piece); + pieces.push_back(std::move(piece)); } return containsSomethingInteresting; @@ -226,7 +226,7 @@ static void adjustCallLocations(PathPieces &Pieces, PathDiagnosticLocation *LastCallLocation = nullptr) { for (PathPieces::iterator I = Pieces.begin(), E = Pieces.end(); I != E; ++I) { - PathDiagnosticCallPiece *Call = dyn_cast(*I); + PathDiagnosticCallPiece *Call = dyn_cast(I->get()); if (!Call) { assert((*I)->getLocation().asLocation().isValid()); @@ -260,14 +260,13 @@ adjustCallLocations(PathPieces &Pieces, /// explicitly in a constructor or braced list. static void removeEdgesToDefaultInitializers(PathPieces &Pieces) { for (PathPieces::iterator I = Pieces.begin(), E = Pieces.end(); I != E;) { - if (PathDiagnosticCallPiece *C = dyn_cast(*I)) + if (auto *C = dyn_cast(I->get())) removeEdgesToDefaultInitializers(C->path); - if (PathDiagnosticMacroPiece *M = dyn_cast(*I)) + if (auto *M = dyn_cast(I->get())) removeEdgesToDefaultInitializers(M->subPieces); - if (PathDiagnosticControlFlowPiece *CF = - dyn_cast(*I)) { + if (auto *CF = dyn_cast(I->get())) { const Stmt *Start = CF->getStartLocation().asStmt(); const Stmt *End = CF->getEndLocation().asStmt(); if (Start && isa(Start)) { @@ -276,8 +275,8 @@ static void removeEdgesToDefaultInitializers(PathPieces &Pieces) { } else if (End && isa(End)) { PathPieces::iterator Next = std::next(I); if (Next != E) { - if (PathDiagnosticControlFlowPiece *NextCF = - dyn_cast(*Next)) { + if (auto *NextCF = + dyn_cast(Next->get())) { NextCF->setStartLocation(CF->getStartLocation()); } } @@ -295,10 +294,10 @@ static void removeEdgesToDefaultInitializers(PathPieces &Pieces) { /// Farm generated functions. static void removePiecesWithInvalidLocations(PathPieces &Pieces) { for (PathPieces::iterator I = Pieces.begin(), E = Pieces.end(); I != E;) { - if (PathDiagnosticCallPiece *C = dyn_cast(*I)) + if (auto *C = dyn_cast(I->get())) removePiecesWithInvalidLocations(C->path); - if (PathDiagnosticMacroPiece *M = dyn_cast(*I)) + if (auto *M = dyn_cast(I->get())) removePiecesWithInvalidLocations(M->subPieces); if (!(*I)->getLocation().isValid() || @@ -518,11 +517,9 @@ static bool GenerateVisitorsOnlyPathDiagnostic( BugReport *R = PDB.getBugReport(); while (const ExplodedNode *Pred = N->getFirstPred()) { - for (auto &V : visitors) { + for (auto &V : visitors) // Visit all the node pairs, but throw the path pieces away. - PathDiagnosticPiece *Piece = V->VisitNode(N, Pred, PDB, *R); - delete Piece; - } + V->VisitNode(N, Pred, PDB, *R); N = Pred; } @@ -536,12 +533,11 @@ static bool GenerateVisitorsOnlyPathDiagnostic( typedef std::pair StackDiagPair; typedef SmallVector StackDiagVector; -static void updateStackPiecesWithMessage(PathDiagnosticPiece *P, +static void updateStackPiecesWithMessage(PathDiagnosticPiece &P, StackDiagVector &CallStack) { // If the piece contains a special message, add it to all the call // pieces on the active stack. 
- if (PathDiagnosticEventPiece *ep = - dyn_cast(P)) { + if (PathDiagnosticEventPiece *ep = dyn_cast(&P)) { if (ep->hasCallStackHint()) for (StackDiagVector::iterator I = CallStack.begin(), @@ -582,13 +578,13 @@ static bool GenerateMinimalPathDiagnostic( do { if (Optional CE = P.getAs()) { - PathDiagnosticCallPiece *C = - PathDiagnosticCallPiece::construct(N, *CE, SMgr); + auto C = PathDiagnosticCallPiece::construct(N, *CE, SMgr); // Record the mapping from call piece to LocationContext. LCM[&C->path] = CE->getCalleeContext(); - PD.getActivePath().push_front(C); - PD.pushActivePath(&C->path); - CallStack.push_back(StackDiagPair(C, N)); + auto *P = C.get(); + PD.getActivePath().push_front(std::move(C)); + PD.pushActivePath(&P->path); + CallStack.push_back(StackDiagPair(P, N)); break; } @@ -604,7 +600,7 @@ static bool GenerateMinimalPathDiagnostic( // a new PathDiagnosticCallPiece. PathDiagnosticCallPiece *C; if (VisitedEntireCall) { - C = cast(PD.getActivePath().front()); + C = cast(PD.getActivePath().front().get()); } else { const Decl *Caller = CE->getLocationContext()->getDecl(); C = PathDiagnosticCallPiece::construct(PD.getActivePath(), Caller); @@ -649,8 +645,9 @@ static bool GenerateMinimalPathDiagnostic( os << "Control jumps to line " << End.asLocation().getExpansionLineNumber(); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); break; } @@ -701,14 +698,16 @@ static bool GenerateMinimalPathDiagnostic( break; } } - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } else { os << "'Default' branch taken. "; const PathDiagnosticLocation &End = PDB.ExecutionContinues(os, N); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } break; @@ -719,8 +718,9 @@ static bool GenerateMinimalPathDiagnostic( std::string sbuf; llvm::raw_string_ostream os(sbuf); PathDiagnosticLocation End = PDB.ExecutionContinues(os, N); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); break; } @@ -741,8 +741,9 @@ static bool GenerateMinimalPathDiagnostic( if (const Stmt *S = End.asStmt()) End = PDB.getEnclosingStmtLocation(S); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); break; } @@ -764,15 +765,17 @@ static bool GenerateMinimalPathDiagnostic( PathDiagnosticLocation End(B->getLHS(), SMgr, LC); PathDiagnosticLocation Start = PathDiagnosticLocation::createOperatorLoc(B, SMgr); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } else { os << "true"; PathDiagnosticLocation Start(B->getLHS(), SMgr, LC); PathDiagnosticLocation End = PDB.ExecutionContinues(N); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } } else { @@ -783,16 +786,18 @@ static bool GenerateMinimalPathDiagnostic( os << "false"; PathDiagnosticLocation Start(B->getLHS(), SMgr, LC); PathDiagnosticLocation End = 
PDB.ExecutionContinues(N); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } else { os << "true"; PathDiagnosticLocation End(B->getLHS(), SMgr, LC); PathDiagnosticLocation Start = PathDiagnosticLocation::createOperatorLoc(B, SMgr); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } } @@ -810,8 +815,9 @@ static bool GenerateMinimalPathDiagnostic( if (const Stmt *S = End.asStmt()) End = PDB.getEnclosingStmtLocation(S); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } else { PathDiagnosticLocation End = PDB.ExecutionContinues(N); @@ -819,8 +825,9 @@ static bool GenerateMinimalPathDiagnostic( if (const Stmt *S = End.asStmt()) End = PDB.getEnclosingStmtLocation(S); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, "Loop condition is false. Exiting loop")); + PD.getActivePath().push_front( + std::make_shared( + Start, End, "Loop condition is false. Exiting loop")); } break; @@ -837,16 +844,18 @@ static bool GenerateMinimalPathDiagnostic( if (const Stmt *S = End.asStmt()) End = PDB.getEnclosingStmtLocation(S); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, os.str())); + PD.getActivePath().push_front( + std::make_shared(Start, End, + os.str())); } else { PathDiagnosticLocation End = PDB.ExecutionContinues(N); if (const Stmt *S = End.asStmt()) End = PDB.getEnclosingStmtLocation(S); - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, "Loop condition is true. Entering loop body")); + PD.getActivePath().push_front( + std::make_shared( + Start, End, "Loop condition is true. Entering loop body")); } break; @@ -859,11 +868,13 @@ static bool GenerateMinimalPathDiagnostic( End = PDB.getEnclosingStmtLocation(S); if (*(Src->succ_begin()+1) == Dst) - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, "Taking false branch")); + PD.getActivePath().push_front( + std::make_shared( + Start, End, "Taking false branch")); else - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece( - Start, End, "Taking true branch")); + PD.getActivePath().push_front( + std::make_shared( + Start, End, "Taking true branch")); break; } @@ -875,9 +886,9 @@ static bool GenerateMinimalPathDiagnostic( // Add diagnostic pieces from custom visitors. 
BugReport *R = PDB.getBugReport(); for (auto &V : visitors) { - if (PathDiagnosticPiece *p = V->VisitNode(N, NextNode, PDB, *R)) { - PD.getActivePath().push_front(p); - updateStackPiecesWithMessage(p, CallStack); + if (auto p = V->VisitNode(N, NextNode, PDB, *R)) { + updateStackPiecesWithMessage(*p, CallStack); + PD.getActivePath().push_front(std::move(p)); } } } @@ -1118,7 +1129,9 @@ void EdgeBuilder::rawAddEdge(PathDiagnosticLocation NewLoc) { PrevLocClean.asLocation().getExpansionLoc()) return; - PD.getActivePath().push_front(new PathDiagnosticControlFlowPiece(NewLocClean, PrevLocClean)); + PD.getActivePath().push_front( + std::make_shared(NewLocClean, + PrevLocClean)); PrevLoc = NewLoc; } @@ -1423,16 +1436,16 @@ static bool GenerateExtensivePathDiagnostic( N->getLocationContext()); } - PathDiagnosticCallPiece *C = - PathDiagnosticCallPiece::construct(N, *CE, SM); + auto C = PathDiagnosticCallPiece::construct(N, *CE, SM); LCM[&C->path] = CE->getCalleeContext(); EB.addEdge(C->callReturn, /*AlwaysAdd=*/true, /*IsPostJump=*/true); EB.flushLocations(); - PD.getActivePath().push_front(C); - PD.pushActivePath(&C->path); - CallStack.push_back(StackDiagPair(C, N)); + auto *P = C.get(); + PD.getActivePath().push_front(std::move(C)); + PD.pushActivePath(&P->path); + CallStack.push_back(StackDiagPair(P, N)); break; } @@ -1458,7 +1471,7 @@ static bool GenerateExtensivePathDiagnostic( // a new PathDiagnosticCallPiece. PathDiagnosticCallPiece *C; if (VisitedEntireCall) { - C = cast(PD.getActivePath().front()); + C = cast(PD.getActivePath().front().get()); } else { const Decl *Caller = CE->getLocationContext()->getDecl(); C = PathDiagnosticCallPiece::construct(PD.getActivePath(), Caller); @@ -1505,13 +1518,12 @@ static bool GenerateExtensivePathDiagnostic( else if (const WhileStmt *WS = dyn_cast(Loop)) CS = dyn_cast(WS->getBody()); - PathDiagnosticEventPiece *p = - new PathDiagnosticEventPiece(L, - "Looping back to the head of the loop"); + auto p = std::make_shared( + L, "Looping back to the head of the loop"); p->setPrunable(true); EB.addEdge(p->getLocation(), true); - PD.getActivePath().push_front(p); + PD.getActivePath().push_front(std::move(p)); if (CS) { PathDiagnosticLocation BL = @@ -1533,12 +1545,12 @@ static bool GenerateExtensivePathDiagnostic( N), Term)) { PathDiagnosticLocation L(Term, SM, PDB.LC); - PathDiagnosticEventPiece *PE = - new PathDiagnosticEventPiece(L, "Loop body executed 0 times"); + auto PE = std::make_shared( + L, "Loop body executed 0 times"); PE->setPrunable(true); EB.addEdge(PE->getLocation(), true); - PD.getActivePath().push_front(PE); + PD.getActivePath().push_front(std::move(PE)); } // In any case, add the terminator as the current statement @@ -1573,11 +1585,11 @@ static bool GenerateExtensivePathDiagnostic( // Add pieces from custom visitors. 
BugReport *R = PDB.getBugReport(); for (auto &V : visitors) { - if (PathDiagnosticPiece *p = V->VisitNode(N, NextNode, PDB, *R)) { + if (auto p = V->VisitNode(N, NextNode, PDB, *R)) { const PathDiagnosticLocation &Loc = p->getLocation(); EB.addEdge(Loc, true); - PD.getActivePath().push_front(p); - updateStackPiecesWithMessage(p, CallStack); + updateStackPiecesWithMessage(*p, CallStack); + PD.getActivePath().push_front(std::move(p)); if (const Stmt *S = Loc.asStmt()) EB.addExtendedContext(PDB.getEnclosingStmtLocation(S).asStmt()); @@ -1610,8 +1622,8 @@ static void addEdgeToPath(PathPieces &path, if (NewLoc.asStmt() && NewLoc.asStmt() == PrevLoc.asStmt()) return; - path.push_front(new PathDiagnosticControlFlowPiece(NewLoc, - PrevLoc)); + path.push_front( + std::make_shared(NewLoc, PrevLoc)); PrevLoc = NewLoc; } @@ -1678,7 +1690,7 @@ static bool GenerateAlternateExtensivePathDiagnostic( // Since we just transferred the path over to the call piece, // reset the mapping from active to location context. assert(PD.getActivePath().size() == 1 && - PD.getActivePath().front() == C); + PD.getActivePath().front().get() == C); LCM[&PD.getActivePath()] = nullptr; // Record the location context mapping for the path within @@ -1729,20 +1741,20 @@ static bool GenerateAlternateExtensivePathDiagnostic( // We are descending into a call (backwards). Construct // a new call piece to contain the path pieces for that call. - PathDiagnosticCallPiece *C = - PathDiagnosticCallPiece::construct(N, *CE, SM); + auto C = PathDiagnosticCallPiece::construct(N, *CE, SM); // Record the location context for this call piece. LCM[&C->path] = CE->getCalleeContext(); // Add the edge to the return site. addEdgeToPath(PD.getActivePath(), PrevLoc, C->callReturn, PDB.LC); - PD.getActivePath().push_front(C); + auto *P = C.get(); + PD.getActivePath().push_front(std::move(C)); PrevLoc.invalidate(); // Make the contents of the call the active path for now. - PD.pushActivePath(&C->path); - CallStack.push_back(StackDiagPair(C, N)); + PD.pushActivePath(&P->path); + CallStack.push_back(StackDiagPair(P, N)); break; } @@ -1797,13 +1809,13 @@ static bool GenerateAlternateExtensivePathDiagnostic( } // do-while statements are explicitly excluded here - PathDiagnosticEventPiece *p = - new PathDiagnosticEventPiece(L, "Looping back to the head " - "of the loop"); + auto p = std::make_shared( + L, "Looping back to the head " + "of the loop"); p->setPrunable(true); addEdgeToPath(PD.getActivePath(), PrevLoc, p->getLocation(), PDB.LC); - PD.getActivePath().push_front(p); + PD.getActivePath().push_front(std::move(p)); if (const CompoundStmt *CS = dyn_cast_or_null(Body)) { addEdgeToPath(PD.getActivePath(), PrevLoc, @@ -1841,12 +1853,11 @@ static bool GenerateAlternateExtensivePathDiagnostic( if (str) { PathDiagnosticLocation L(TermCond ? TermCond : Term, SM, PDB.LC); - PathDiagnosticEventPiece *PE = - new PathDiagnosticEventPiece(L, str); + auto PE = std::make_shared(L, str); PE->setPrunable(true); addEdgeToPath(PD.getActivePath(), PrevLoc, PE->getLocation(), PDB.LC); - PD.getActivePath().push_front(PE); + PD.getActivePath().push_front(std::move(PE)); } } else if (isa(Term) || isa(Term) || isa(Term)) { @@ -1863,10 +1874,10 @@ static bool GenerateAlternateExtensivePathDiagnostic( // Add pieces from custom visitors. 
for (auto &V : visitors) { - if (PathDiagnosticPiece *p = V->VisitNode(N, NextNode, PDB, *report)) { + if (auto p = V->VisitNode(N, NextNode, PDB, *report)) { addEdgeToPath(PD.getActivePath(), PrevLoc, p->getLocation(), PDB.LC); - PD.getActivePath().push_front(p); - updateStackPiecesWithMessage(p, CallStack); + updateStackPiecesWithMessage(*p, CallStack); + PD.getActivePath().push_front(std::move(p)); } } } @@ -1973,7 +1984,7 @@ static void addContextEdges(PathPieces &pieces, SourceManager &SM, for (PathPieces::iterator I = pieces.begin(), E = Prev; I != E; Prev = I, ++I) { PathDiagnosticControlFlowPiece *Piece = - dyn_cast(*I); + dyn_cast(I->get()); if (!Piece) continue; @@ -2014,8 +2025,7 @@ static void addContextEdges(PathPieces &pieces, SourceManager &SM, // Try to extend the previous edge if it's at the same level as the source // context. if (Prev != E) { - PathDiagnosticControlFlowPiece *PrevPiece = - dyn_cast(*Prev); + auto *PrevPiece = dyn_cast(Prev->get()); if (PrevPiece) { if (const Stmt *PrevSrc = getLocStmt(PrevPiece->getStartLocation())) { @@ -2031,8 +2041,10 @@ static void addContextEdges(PathPieces &pieces, SourceManager &SM, // Otherwise, split the current edge into a context edge and a // subexpression edge. Note that the context statement may itself have // context. - Piece = new PathDiagnosticControlFlowPiece(SrcLoc, DstContext); - I = pieces.insert(I, Piece); + auto P = + std::make_shared(SrcLoc, DstContext); + Piece = P.get(); + I = pieces.insert(I, std::move(P)); } } } @@ -2051,8 +2063,7 @@ static void addContextEdges(PathPieces &pieces, SourceManager &SM, static void simplifySimpleBranches(PathPieces &pieces) { for (PathPieces::iterator I = pieces.begin(), E = pieces.end(); I != E; ++I) { - PathDiagnosticControlFlowPiece *PieceI = - dyn_cast(*I); + auto *PieceI = dyn_cast(I->get()); if (!PieceI) continue; @@ -2073,7 +2084,7 @@ static void simplifySimpleBranches(PathPieces &pieces) { if (NextI == E) break; - PathDiagnosticEventPiece *EV = dyn_cast(*NextI); + auto *EV = dyn_cast(NextI->get()); if (EV) { StringRef S = EV->getString(); if (S == StrEnteringLoop || S == StrLoopBodyZero || @@ -2084,7 +2095,7 @@ static void simplifySimpleBranches(PathPieces &pieces) { break; } - PieceNextI = dyn_cast(*NextI); + PieceNextI = dyn_cast(NextI->get()); break; } @@ -2176,7 +2187,7 @@ static void removeContextCycles(PathPieces &Path, SourceManager &SM, for (PathPieces::iterator I = Path.begin(), E = Path.end(); I != E; ) { // Pattern match the current piece and its successor. 
PathDiagnosticControlFlowPiece *PieceI = - dyn_cast(*I); + dyn_cast(I->get()); if (!PieceI) { ++I; @@ -2191,14 +2202,14 @@ static void removeContextCycles(PathPieces &Path, SourceManager &SM, break; PathDiagnosticControlFlowPiece *PieceNextI = - dyn_cast(*NextI); + dyn_cast(NextI->get()); if (!PieceNextI) { - if (isa(*NextI)) { + if (isa(NextI->get())) { ++NextI; if (NextI == E) break; - PieceNextI = dyn_cast(*NextI); + PieceNextI = dyn_cast(NextI->get()); } if (!PieceNextI) { @@ -2251,8 +2262,7 @@ static void removePunyEdges(PathPieces &path, erased = false; - PathDiagnosticControlFlowPiece *PieceI = - dyn_cast(*I); + auto *PieceI = dyn_cast(I->get()); if (!PieceI) continue; @@ -2299,8 +2309,7 @@ static void removePunyEdges(PathPieces &path, static void removeIdenticalEvents(PathPieces &path) { for (PathPieces::iterator I = path.begin(), E = path.end(); I != E; ++I) { - PathDiagnosticEventPiece *PieceI = - dyn_cast(*I); + auto *PieceI = dyn_cast(I->get()); if (!PieceI) continue; @@ -2309,8 +2318,7 @@ static void removeIdenticalEvents(PathPieces &path) { if (NextI == E) return; - PathDiagnosticEventPiece *PieceNextI = - dyn_cast(*NextI); + auto *PieceNextI = dyn_cast(NextI->get()); if (!PieceNextI) continue; @@ -2332,7 +2340,7 @@ static bool optimizeEdges(PathPieces &path, SourceManager &SM, for (PathPieces::iterator I = path.begin(), E = path.end(); I != E; ) { // Optimize subpaths. - if (PathDiagnosticCallPiece *CallI = dyn_cast(*I)){ + if (auto *CallI = dyn_cast(I->get())) { // Record the fact that a call has been optimized so we only do the // effort once. if (!OCS.count(CallI)) { @@ -2344,8 +2352,7 @@ static bool optimizeEdges(PathPieces &path, SourceManager &SM, } // Pattern match the current piece and its successor. - PathDiagnosticControlFlowPiece *PieceI = - dyn_cast(*I); + auto *PieceI = dyn_cast(I->get()); if (!PieceI) { ++I; @@ -2361,8 +2368,7 @@ static bool optimizeEdges(PathPieces &path, SourceManager &SM, if (NextI == E) break; - PathDiagnosticControlFlowPiece *PieceNextI = - dyn_cast(*NextI); + auto *PieceNextI = dyn_cast(NextI->get()); if (!PieceNextI) { ++I; @@ -2511,8 +2517,8 @@ static bool optimizeEdges(PathPieces &path, SourceManager &SM, static void dropFunctionEntryEdge(PathPieces &Path, LocationContextMap &LCM, SourceManager &SM) { - const PathDiagnosticControlFlowPiece *FirstEdge = - dyn_cast(Path.front()); + const auto *FirstEdge = + dyn_cast(Path.front().get()); if (!FirstEdge) return; @@ -2967,11 +2973,11 @@ bool TrimmedGraph::popNextReportGraph(ReportGraph &GraphWrapper) { /// CompactPathDiagnostic - This function postprocesses a PathDiagnostic object /// and collapses PathDiagosticPieces that are expanded by macros. static void CompactPathDiagnostic(PathPieces &path, const SourceManager& SM) { - typedef std::vector, - SourceLocation> > MacroStackTy; + typedef std::vector< + std::pair, SourceLocation>> + MacroStackTy; - typedef std::vector > - PiecesTy; + typedef std::vector> PiecesTy; MacroStackTy MacroStack; PiecesTy Pieces; @@ -2979,10 +2985,10 @@ static void CompactPathDiagnostic(PathPieces &path, const SourceManager& SM) { for (PathPieces::const_iterator I = path.begin(), E = path.end(); I!=E; ++I) { - PathDiagnosticPiece *piece = I->get(); + auto &piece = *I; // Recursively compact calls. - if (PathDiagnosticCallPiece *call=dyn_cast(piece)){ + if (auto *call = dyn_cast(&*piece)) { CompactPathDiagnostic(call->path, SM); } @@ -3011,7 +3017,7 @@ static void CompactPathDiagnostic(PathPieces &path, const SourceManager& SM) { // We aren't in the same group. 
Are we descending into a new macro // or are part of an old one? - IntrusiveRefCntPtr MacroGroup; + std::shared_ptr MacroGroup; SourceLocation ParentInstantiationLoc = InstantiationLoc.isMacroID() ? SM.getExpansionLoc(Loc) : @@ -3034,8 +3040,7 @@ static void CompactPathDiagnostic(PathPieces &path, const SourceManager& SM) { if (!MacroGroup || ParentInstantiationLoc == MacroStack.back().second) { // Create a new macro group and add it to the stack. - PathDiagnosticMacroPiece *NewGroup = - new PathDiagnosticMacroPiece( + auto NewGroup = std::make_shared( PathDiagnosticLocation::createSingleLocation(piece->getLocation())); if (MacroGroup) @@ -3477,13 +3482,12 @@ void BugReporter::FlushReport(BugReport *exampleReport, for (auto I = exampleReport->getNotes().rbegin(), E = exampleReport->getNotes().rend(); I != E; ++I) { PathDiagnosticNotePiece *Piece = I->get(); - PathDiagnosticEventPiece *ConvertedPiece = - new PathDiagnosticEventPiece(Piece->getLocation(), - Piece->getString()); + auto ConvertedPiece = std::make_shared( + Piece->getLocation(), Piece->getString()); for (const auto &R: Piece->getRanges()) ConvertedPiece->addRange(R); - Pieces.push_front(ConvertedPiece); + Pieces.push_front(std::move(ConvertedPiece)); } } else { for (auto I = exampleReport->getNotes().rbegin(), diff --git a/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp index 7f20f0d7703e..c3c3f2ff76ec 100644 --- a/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ b/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -229,10 +229,9 @@ public: return Options.shouldAvoidSuppressingNullArgumentPaths(); } - PathDiagnosticPiece *visitNodeInitial(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { + std::shared_ptr + visitNodeInitial(const ExplodedNode *N, const ExplodedNode *PrevN, + BugReporterContext &BRC, BugReport &BR) { // Only print a message at the interesting return statement. 
if (N->getLocationContext() != StackFrame) return nullptr; @@ -328,13 +327,12 @@ public: if (!L.isValid() || !L.asLocation().isValid()) return nullptr; - return new PathDiagnosticEventPiece(L, Out.str()); + return std::make_shared(L, Out.str()); } - PathDiagnosticPiece *visitNodeMaybeUnsuppress(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { + std::shared_ptr + visitNodeMaybeUnsuppress(const ExplodedNode *N, const ExplodedNode *PrevN, + BugReporterContext &BRC, BugReport &BR) { #ifndef NDEBUG ExprEngine &Eng = BRC.getBugReporter().getEngine(); AnalyzerOptions &Options = Eng.getAnalysisManager().options; @@ -384,10 +382,10 @@ public: return nullptr; } - PathDiagnosticPiece *VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) override { + std::shared_ptr VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, + BugReport &BR) override { switch (Mode) { case Initial: return visitNodeInitial(N, PrevN, BRC, BR); @@ -448,10 +446,10 @@ static bool isInitializationOfVar(const ExplodedNode *N, const VarRegion *VR) { return FrameSpace->getStackFrame() == LCtx->getCurrentStackFrame(); } -PathDiagnosticPiece *FindLastStoreBRVisitor::VisitNode(const ExplodedNode *Succ, - const ExplodedNode *Pred, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +FindLastStoreBRVisitor::VisitNode(const ExplodedNode *Succ, + const ExplodedNode *Pred, + BugReporterContext &BRC, BugReport &BR) { if (Satisfied) return nullptr; @@ -706,7 +704,7 @@ PathDiagnosticPiece *FindLastStoreBRVisitor::VisitNode(const ExplodedNode *Succ, if (!L.isValid() || !L.asLocation().isValid()) return nullptr; - return new PathDiagnosticEventPiece(L, os.str()); + return std::make_shared(L, os.str()); } void TrackConstraintBRVisitor::Profile(llvm::FoldingSetNodeID &ID) const { @@ -728,11 +726,10 @@ bool TrackConstraintBRVisitor::isUnderconstrained(const ExplodedNode *N) const { return (bool)N->getState()->assume(Constraint, !Assumption); } -PathDiagnosticPiece * +std::shared_ptr TrackConstraintBRVisitor::VisitNode(const ExplodedNode *N, const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { + BugReporterContext &BRC, BugReport &BR) { if (IsSatisfied) return nullptr; @@ -775,9 +772,9 @@ TrackConstraintBRVisitor::VisitNode(const ExplodedNode *N, if (!L.isValid()) return nullptr; - PathDiagnosticEventPiece *X = new PathDiagnosticEventPiece(L, os.str()); + auto X = std::make_shared(L, os.str()); X->setTag(getTag()); - return X; + return std::move(X); } return nullptr; @@ -808,7 +805,7 @@ const char *SuppressInlineDefensiveChecksVisitor::getTag() { return "IDCVisitor"; } -PathDiagnosticPiece * +std::shared_ptr SuppressInlineDefensiveChecksVisitor::VisitNode(const ExplodedNode *Succ, const ExplodedNode *Pred, BugReporterContext &BRC, @@ -1121,10 +1118,10 @@ const Expr *NilReceiverBRVisitor::getNilReceiver(const Stmt *S, return nullptr; } -PathDiagnosticPiece *NilReceiverBRVisitor::VisitNode(const ExplodedNode *N, - const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +NilReceiverBRVisitor::VisitNode(const ExplodedNode *N, + const ExplodedNode *PrevN, + BugReporterContext &BRC, BugReport &BR) { Optional P = N->getLocationAs(); if (!P) return nullptr; @@ -1155,7 +1152,7 @@ PathDiagnosticPiece *NilReceiverBRVisitor::VisitNode(const ExplodedNode *N, // Issue a message saying that the method was skipped. 
PathDiagnosticLocation L(Receiver, BRC.getSourceManager(), N->getLocationContext()); - return new PathDiagnosticEventPiece(L, OS.str()); + return std::make_shared(L, OS.str()); } // Registers every VarDecl inside a Stmt with a last store visitor. @@ -1204,23 +1201,22 @@ const char *ConditionBRVisitor::getTag() { return "ConditionBRVisitor"; } -PathDiagnosticPiece *ConditionBRVisitor::VisitNode(const ExplodedNode *N, - const ExplodedNode *Prev, - BugReporterContext &BRC, - BugReport &BR) { - PathDiagnosticPiece *piece = VisitNodeImpl(N, Prev, BRC, BR); +std::shared_ptr +ConditionBRVisitor::VisitNode(const ExplodedNode *N, const ExplodedNode *Prev, + BugReporterContext &BRC, BugReport &BR) { + auto piece = VisitNodeImpl(N, Prev, BRC, BR); if (piece) { piece->setTag(getTag()); - if (PathDiagnosticEventPiece *ev=dyn_cast(piece)) + if (auto *ev = dyn_cast(piece.get())) ev->setPrunable(true, /* override */ false); } return piece; } -PathDiagnosticPiece *ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N, - const ExplodedNode *Prev, - BugReporterContext &BRC, - BugReport &BR) { +std::shared_ptr +ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N, + const ExplodedNode *Prev, + BugReporterContext &BRC, BugReport &BR) { ProgramPoint progPoint = N->getLocation(); ProgramStateRef CurrentState = N->getState(); @@ -1263,13 +1259,9 @@ PathDiagnosticPiece *ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N, return nullptr; } -PathDiagnosticPiece * -ConditionBRVisitor::VisitTerminator(const Stmt *Term, - const ExplodedNode *N, - const CFGBlock *srcBlk, - const CFGBlock *dstBlk, - BugReport &R, - BugReporterContext &BRC) { +std::shared_ptr ConditionBRVisitor::VisitTerminator( + const Stmt *Term, const ExplodedNode *N, const CFGBlock *srcBlk, + const CFGBlock *dstBlk, BugReport &R, BugReporterContext &BRC) { const Expr *Cond = nullptr; // In the code below, Term is a CFG terminator and Cond is a branch condition @@ -1322,11 +1314,9 @@ ConditionBRVisitor::VisitTerminator(const Stmt *Term, return VisitTrueTest(Cond, tookTrue, BRC, R, N); } -PathDiagnosticPiece * -ConditionBRVisitor::VisitTrueTest(const Expr *Cond, - bool tookTrue, - BugReporterContext &BRC, - BugReport &R, +std::shared_ptr +ConditionBRVisitor::VisitTrueTest(const Expr *Cond, bool tookTrue, + BugReporterContext &BRC, BugReport &R, const ExplodedNode *N) { // These will be modified in code below, but we need to preserve the original // values in case we want to throw the generic message. @@ -1339,13 +1329,13 @@ ConditionBRVisitor::VisitTrueTest(const Expr *Cond, default: break; case Stmt::BinaryOperatorClass: - if (PathDiagnosticPiece *P = VisitTrueTest( - Cond, cast(CondTmp), tookTrueTmp, BRC, R, N)) + if (auto P = VisitTrueTest(Cond, cast(CondTmp), + tookTrueTmp, BRC, R, N)) return P; break; case Stmt::DeclRefExprClass: - if (PathDiagnosticPiece *P = VisitTrueTest( - Cond, cast(CondTmp), tookTrueTmp, BRC, R, N)) + if (auto P = VisitTrueTest(Cond, cast(CondTmp), + tookTrueTmp, BRC, R, N)) return P; break; case Stmt::UnaryOperatorClass: { @@ -1368,9 +1358,8 @@ ConditionBRVisitor::VisitTrueTest(const Expr *Cond, if (!Loc.isValid() || !Loc.asLocation().isValid()) return nullptr; - PathDiagnosticEventPiece *Event = new PathDiagnosticEventPiece( + return std::make_shared( Loc, tookTrue ? 
GenericTrueMessage : GenericFalseMessage); - return Event; } bool ConditionBRVisitor::patternMatch(const Expr *Ex, @@ -1470,13 +1459,10 @@ bool ConditionBRVisitor::patternMatch(const Expr *Ex, return false; } -PathDiagnosticPiece * -ConditionBRVisitor::VisitTrueTest(const Expr *Cond, - const BinaryOperator *BExpr, - const bool tookTrue, - BugReporterContext &BRC, - BugReport &R, - const ExplodedNode *N) { +std::shared_ptr +ConditionBRVisitor::VisitTrueTest(const Expr *Cond, const BinaryOperator *BExpr, + const bool tookTrue, BugReporterContext &BRC, + BugReport &R, const ExplodedNode *N) { bool shouldInvert = false; Optional shouldPrune; @@ -1549,20 +1535,15 @@ ConditionBRVisitor::VisitTrueTest(const Expr *Cond, Out << (shouldInvert ? LhsString : RhsString); const LocationContext *LCtx = N->getLocationContext(); PathDiagnosticLocation Loc(Cond, BRC.getSourceManager(), LCtx); - PathDiagnosticEventPiece *event = - new PathDiagnosticEventPiece(Loc, Out.str()); + auto event = std::make_shared(Loc, Out.str()); if (shouldPrune.hasValue()) event->setPrunable(shouldPrune.getValue()); return event; } -PathDiagnosticPiece * -ConditionBRVisitor::VisitConditionVariable(StringRef LhsString, - const Expr *CondVarExpr, - const bool tookTrue, - BugReporterContext &BRC, - BugReport &report, - const ExplodedNode *N) { +std::shared_ptr ConditionBRVisitor::VisitConditionVariable( + StringRef LhsString, const Expr *CondVarExpr, const bool tookTrue, + BugReporterContext &BRC, BugReport &report, const ExplodedNode *N) { // FIXME: If there's already a constraint tracker for this variable, // we shouldn't emit anything here (c.f. the double note in // test/Analysis/inlining/path-notes.c) @@ -1585,8 +1566,7 @@ ConditionBRVisitor::VisitConditionVariable(StringRef LhsString, const LocationContext *LCtx = N->getLocationContext(); PathDiagnosticLocation Loc(CondVarExpr, BRC.getSourceManager(), LCtx); - PathDiagnosticEventPiece *event = - new PathDiagnosticEventPiece(Loc, Out.str()); + auto event = std::make_shared(Loc, Out.str()); if (const DeclRefExpr *DR = dyn_cast(CondVarExpr)) { if (const VarDecl *VD = dyn_cast(DR->getDecl())) { @@ -1601,13 +1581,10 @@ ConditionBRVisitor::VisitConditionVariable(StringRef LhsString, return event; } -PathDiagnosticPiece * -ConditionBRVisitor::VisitTrueTest(const Expr *Cond, - const DeclRefExpr *DR, - const bool tookTrue, - BugReporterContext &BRC, - BugReport &report, - const ExplodedNode *N) { +std::shared_ptr +ConditionBRVisitor::VisitTrueTest(const Expr *Cond, const DeclRefExpr *DR, + const bool tookTrue, BugReporterContext &BRC, + BugReport &report, const ExplodedNode *N) { const VarDecl *VD = dyn_cast(DR->getDecl()); if (!VD) @@ -1631,8 +1608,7 @@ ConditionBRVisitor::VisitTrueTest(const Expr *Cond, const LocationContext *LCtx = N->getLocationContext(); PathDiagnosticLocation Loc(Cond, BRC.getSourceManager(), LCtx); - PathDiagnosticEventPiece *event = - new PathDiagnosticEventPiece(Loc, Out.str()); + auto event = std::make_shared(Loc, Out.str()); const ProgramState *state = N->getState().get(); if (const MemRegion *R = state->getLValue(VD, LCtx).getAsRegion()) { @@ -1644,7 +1620,7 @@ ConditionBRVisitor::VisitTrueTest(const Expr *Cond, event->setPrunable(false); } } - return event; + return std::move(event); } const char *const ConditionBRVisitor::GenericTrueMessage = @@ -1746,11 +1722,10 @@ LikelyFalsePositiveSuppressionBRVisitor::getEndPath(BugReporterContext &BRC, return nullptr; } -PathDiagnosticPiece * +std::shared_ptr UndefOrNullArgVisitor::VisitNode(const ExplodedNode *N, - 
const ExplodedNode *PrevN, - BugReporterContext &BRC, - BugReport &BR) { + const ExplodedNode *PrevN, + BugReporterContext &BRC, BugReport &BR) { ProgramStateRef State = N->getState(); ProgramPoint ProgLoc = N->getLocation(); @@ -1800,7 +1775,7 @@ UndefOrNullArgVisitor::VisitNode(const ExplodedNode *N, return nullptr; } -PathDiagnosticPiece * +std::shared_ptr CXXSelfAssignmentBRVisitor::VisitNode(const ExplodedNode *Succ, const ExplodedNode *Pred, BugReporterContext &BRC, BugReport &BR) { @@ -1847,8 +1822,8 @@ CXXSelfAssignmentBRVisitor::VisitNode(const ExplodedNode *Succ, Out << "Assuming " << Met->getParamDecl(0)->getName() << ((Param == This) ? " == " : " != ") << "*this"; - auto *Piece = new PathDiagnosticEventPiece(L, Out.str()); + auto Piece = std::make_shared(L, Out.str()); Piece->addRange(Met->getSourceRange()); - return Piece; + return std::move(Piece); } diff --git a/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index f157c3dd6ce2..f0f6dd2e43e7 100644 --- a/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -156,8 +156,8 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, unsigned TotalPieces = path.size(); unsigned TotalNotePieces = std::count_if(path.begin(), path.end(), - [](const IntrusiveRefCntPtr &p) { - return isa(p.get()); + [](const std::shared_ptr &p) { + return isa(*p); }); unsigned TotalRegularPieces = TotalPieces - TotalNotePieces; @@ -615,12 +615,13 @@ unsigned HTMLDiagnostics::ProcessMacroPiece(raw_ostream &os, I!=E; ++I) { if (const PathDiagnosticMacroPiece *MP = - dyn_cast(*I)) { + dyn_cast(I->get())) { num = ProcessMacroPiece(os, *MP, num); continue; } - if (PathDiagnosticEventPiece *EP = dyn_cast(*I)) { + if (PathDiagnosticEventPiece *EP = + dyn_cast(I->get())) { os << "
" "" diff --git a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp index 5675cb2026f0..7c5ee3b25944 100644 --- a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp +++ b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp @@ -29,11 +29,10 @@ using namespace clang; using namespace ento; bool PathDiagnosticMacroPiece::containsEvent() const { - for (PathPieces::const_iterator I = subPieces.begin(), E = subPieces.end(); - I!=E; ++I) { - if (isa(*I)) + for (auto &P : subPieces) { + if (isa(*P)) return true; - if (PathDiagnosticMacroPiece *MP = dyn_cast(*I)) + if (auto *MP = dyn_cast(P.get())) if (MP->containsEvent()) return true; } @@ -64,33 +63,27 @@ PathDiagnosticNotePiece::~PathDiagnosticNotePiece() {} void PathPieces::flattenTo(PathPieces &Primary, PathPieces &Current, bool ShouldFlattenMacros) const { - for (PathPieces::const_iterator I = begin(), E = end(); I != E; ++I) { - PathDiagnosticPiece *Piece = I->get(); - + for (auto &Piece : *this) { switch (Piece->getKind()) { case PathDiagnosticPiece::Call: { - PathDiagnosticCallPiece *Call = cast(Piece); - IntrusiveRefCntPtr CallEnter = - Call->getCallEnterEvent(); - if (CallEnter) - Current.push_back(CallEnter); - Call->path.flattenTo(Primary, Primary, ShouldFlattenMacros); - IntrusiveRefCntPtr callExit = - Call->getCallExitEvent(); - if (callExit) - Current.push_back(callExit); + auto &Call = cast(*Piece); + if (auto CallEnter = Call.getCallEnterEvent()) + Current.push_back(std::move(CallEnter)); + Call.path.flattenTo(Primary, Primary, ShouldFlattenMacros); + if (auto callExit = Call.getCallExitEvent()) + Current.push_back(std::move(callExit)); break; } case PathDiagnosticPiece::Macro: { - PathDiagnosticMacroPiece *Macro = cast(Piece); + auto &Macro = cast(*Piece); if (ShouldFlattenMacros) { - Macro->subPieces.flattenTo(Primary, Primary, ShouldFlattenMacros); + Macro.subPieces.flattenTo(Primary, Primary, ShouldFlattenMacros); } else { Current.push_back(Piece); PathPieces NewPath; - Macro->subPieces.flattenTo(Primary, NewPath, ShouldFlattenMacros); + Macro.subPieces.flattenTo(Primary, NewPath, ShouldFlattenMacros); // FIXME: This probably shouldn't mutate the original path piece. - Macro->subPieces = NewPath; + Macro.subPieces = NewPath; } break; } @@ -143,7 +136,7 @@ getFirstStackedCallToHeaderFile(PathDiagnosticCallPiece *CP, // Check if the last piece in the callee path is a call to a function outside // of the main file. if (PathDiagnosticCallPiece *CPInner = - dyn_cast(Path.back())) { + dyn_cast(Path.back().get())) { return getFirstStackedCallToHeaderFile(CPInner, SMgr); } @@ -890,24 +883,26 @@ void PathDiagnosticLocation::flatten() { // Manipulation of PathDiagnosticCallPieces. 
//===----------------------------------------------------------------------===// -PathDiagnosticCallPiece * -PathDiagnosticCallPiece::construct(const ExplodedNode *N, - const CallExitEnd &CE, +std::shared_ptr +PathDiagnosticCallPiece::construct(const ExplodedNode *N, const CallExitEnd &CE, const SourceManager &SM) { const Decl *caller = CE.getLocationContext()->getDecl(); PathDiagnosticLocation pos = getLocationForCaller(CE.getCalleeContext(), CE.getLocationContext(), SM); - return new PathDiagnosticCallPiece(caller, pos); + return std::shared_ptr( + new PathDiagnosticCallPiece(caller, pos)); } PathDiagnosticCallPiece * PathDiagnosticCallPiece::construct(PathPieces &path, const Decl *caller) { - PathDiagnosticCallPiece *C = new PathDiagnosticCallPiece(path, caller); + std::shared_ptr C( + new PathDiagnosticCallPiece(path, caller)); path.clear(); - path.push_front(C); - return C; + auto *R = C.get(); + path.push_front(std::move(C)); + return R; } void PathDiagnosticCallPiece::setCallee(const CallEnter &CE, @@ -989,7 +984,7 @@ static bool describeCodeDecl(raw_ostream &Out, const Decl *D, return true; } -IntrusiveRefCntPtr +std::shared_ptr PathDiagnosticCallPiece::getCallEnterEvent() const { if (!Callee) return nullptr; @@ -1001,10 +996,10 @@ PathDiagnosticCallPiece::getCallEnterEvent() const { describeCodeDecl(Out, Callee, /*ExtendedDescription=*/true); assert(callEnter.asLocation().isValid()); - return new PathDiagnosticEventPiece(callEnter, Out.str()); + return std::make_shared(callEnter, Out.str()); } -IntrusiveRefCntPtr +std::shared_ptr PathDiagnosticCallPiece::getCallEnterWithinCallerEvent() const { if (!callEnterWithin.asLocation().isValid()) return nullptr; @@ -1020,10 +1015,10 @@ PathDiagnosticCallPiece::getCallEnterWithinCallerEvent() const { Out << "Entered call"; describeCodeDecl(Out, Caller, /*ExtendedDescription=*/false, " from "); - return new PathDiagnosticEventPiece(callEnterWithin, Out.str()); + return std::make_shared(callEnterWithin, Out.str()); } -IntrusiveRefCntPtr +std::shared_ptr PathDiagnosticCallPiece::getCallExitEvent() const { if (NoExit) return nullptr; @@ -1042,7 +1037,7 @@ PathDiagnosticCallPiece::getCallExitEvent() const { } assert(callReturn.asLocation().isValid()); - return new PathDiagnosticEventPiece(callReturn, Out.str()); + return std::make_shared(callReturn, Out.str()); } static void compute_path_size(const PathPieces &pieces, unsigned &size) { diff --git a/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp b/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp index c5263ee0e5ca..66812ed8ff5b 100644 --- a/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp +++ b/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp @@ -208,19 +208,14 @@ static void ReportCall(raw_ostream &o, unsigned indent, unsigned depth) { - IntrusiveRefCntPtr callEnter = - P.getCallEnterEvent(); - - if (callEnter) + if (auto callEnter = P.getCallEnterEvent()) ReportPiece(o, *callEnter, FM, SM, LangOpts, indent, depth, true, P.isLastInMainSourceFile()); - IntrusiveRefCntPtr callEnterWithinCaller = - P.getCallEnterWithinCallerEvent(); ++depth; - if (callEnterWithinCaller) + if (auto callEnterWithinCaller = P.getCallEnterWithinCallerEvent()) ReportPiece(o, *callEnterWithinCaller, FM, SM, LangOpts, indent, depth, true); @@ -229,10 +224,7 @@ static void ReportCall(raw_ostream &o, --depth; - IntrusiveRefCntPtr callExit = - P.getCallExitEvent(); - - if (callExit) + if (auto callExit = P.getCallExitEvent()) ReportPiece(o, *callExit, FM, SM, LangOpts, indent, depth, true); } @@ -299,10 +291,9 @@ void 
PlistDiagnostics::FlushDiagnosticsImpl( if (!Diags.empty()) SM = &Diags.front()->path.front()->getLocation().getManager(); - - auto AddPieceFID = [&FM, &Fids, SM](const PathDiagnosticPiece *Piece)->void { - AddFID(FM, Fids, *SM, Piece->getLocation().asLocation()); - ArrayRef<SourceRange> Ranges = Piece->getRanges(); + auto AddPieceFID = [&FM, &Fids, SM](const PathDiagnosticPiece &Piece) { + AddFID(FM, Fids, *SM, Piece.getLocation().asLocation()); + ArrayRef<SourceRange> Ranges = Piece.getRanges(); for (const SourceRange &Range : Ranges) { AddFID(FM, Fids, *SM, Range.getBegin()); AddFID(FM, Fids, *SM, Range.getEnd()); @@ -318,23 +309,20 @@ void PlistDiagnostics::FlushDiagnosticsImpl( const PathPieces &Path = *WorkList.pop_back_val(); for (const auto &Iter : Path) { - const PathDiagnosticPiece *Piece = Iter.get(); + const PathDiagnosticPiece &Piece = *Iter; AddPieceFID(Piece); if (const PathDiagnosticCallPiece *Call = - dyn_cast<PathDiagnosticCallPiece>(Piece)) { - if (IntrusiveRefCntPtr<PathDiagnosticEventPiece> - CallEnterWithin = Call->getCallEnterWithinCallerEvent()) - AddPieceFID(CallEnterWithin.get()); + dyn_cast<PathDiagnosticCallPiece>(&Piece)) { + if (auto CallEnterWithin = Call->getCallEnterWithinCallerEvent()) + AddPieceFID(*CallEnterWithin); - if (IntrusiveRefCntPtr<PathDiagnosticEventPiece> - CallEnterEvent = Call->getCallEnterEvent()) - AddPieceFID(CallEnterEvent.get()); + if (auto CallEnterEvent = Call->getCallEnterEvent()) + AddPieceFID(*CallEnterEvent); WorkList.push_back(&Call->path); - } - else if (const PathDiagnosticMacroPiece *Macro = - dyn_cast<PathDiagnosticMacroPiece>(Piece)) { + } else if (const PathDiagnosticMacroPiece *Macro = + dyn_cast<PathDiagnosticMacroPiece>(&Piece)) { WorkList.push_back(&Macro->subPieces); } } diff --git a/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp b/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp index 31b6638e651f..6792f89876cd 100644 --- a/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp +++ b/lib/StaticAnalyzer/Frontend/CheckerRegistration.cpp @@ -116,7 +116,7 @@ ento::createCheckerManager(AnalyzerOptions &opts, const LangOptions &langOpts, ArrayRef<std::string> plugins, DiagnosticsEngine &diags) { std::unique_ptr<CheckerManager> checkerMgr( - new CheckerManager(langOpts, &opts)); + new CheckerManager(langOpts, opts)); SmallVector<CheckerOptInfo, 8> checkerOpts = getCheckerOptList(opts); diff --git a/lib/StaticAnalyzer/Frontend/ModelInjector.cpp b/lib/StaticAnalyzer/Frontend/ModelInjector.cpp index 0a284851b08d..c6f3baa7e3b2 100644 --- a/lib/StaticAnalyzer/Frontend/ModelInjector.cpp +++ b/lib/StaticAnalyzer/Frontend/ModelInjector.cpp @@ -62,8 +62,7 @@ void ModelInjector::onBodySynthesis(const NamedDecl *D) { return; } - IntrusiveRefCntPtr<CompilerInvocation> Invocation( - new CompilerInvocation(CI.getInvocation())); + auto Invocation = std::make_shared<CompilerInvocation>(CI.getInvocation()); FrontendOptions &FrontendOpts = Invocation->getFrontendOpts(); InputKind IK = IK_CXX; // FIXME @@ -76,7 +75,7 @@ void ModelInjector::onBodySynthesis(const NamedDecl *D) { // Modules are parsed by a separate CompilerInstance, so this code mimics that // behavior for models CompilerInstance Instance(CI.getPCHContainerOperations()); - Instance.setInvocation(&*Invocation); + Instance.setInvocation(std::move(Invocation)); Instance.createDiagnostics( new ForwardingDiagnosticConsumer(CI.getDiagnosticClient()), /*ShouldOwnClient=*/true); @@ -89,7 +88,7 @@ void ModelInjector::onBodySynthesis(const NamedDecl *D) { // is set to true to avoid double free issues Instance.setFileManager(&CI.getFileManager()); Instance.setSourceManager(&SM); - Instance.setPreprocessor(&CI.getPreprocessor()); + Instance.setPreprocessor(CI.getPreprocessorPtr()); Instance.setASTContext(&CI.getASTContext());
Instance.getPreprocessor().InitializeForModelFile(); diff --git a/lib/Tooling/Tooling.cpp b/lib/Tooling/Tooling.cpp index 529c47ef1e7a..25cee98078f3 100644 --- a/lib/Tooling/Tooling.cpp +++ b/lib/Tooling/Tooling.cpp @@ -275,13 +275,13 @@ bool ToolInvocation::run() { Invocation->getPreprocessorOpts().addRemappedFile(It.getKey(), Input.release()); } - return runInvocation(BinaryName, Compilation.get(), Invocation.release(), + return runInvocation(BinaryName, Compilation.get(), std::move(Invocation), std::move(PCHContainerOps)); } bool ToolInvocation::runInvocation( const char *BinaryName, clang::driver::Compilation *Compilation, - clang::CompilerInvocation *Invocation, + std::shared_ptr<clang::CompilerInvocation> Invocation, std::shared_ptr<PCHContainerOperations> PCHContainerOps) { // Show the invocation, with -v. if (Invocation->getHeaderSearchOpts().Verbose) { @@ -290,17 +290,17 @@ bool ToolInvocation::runInvocation( llvm::errs() << "\n"; } - return Action->runInvocation(Invocation, Files, std::move(PCHContainerOps), - DiagConsumer); + return Action->runInvocation(std::move(Invocation), Files, + std::move(PCHContainerOps), DiagConsumer); } bool FrontendActionFactory::runInvocation( - CompilerInvocation *Invocation, FileManager *Files, + std::shared_ptr<CompilerInvocation> Invocation, FileManager *Files, std::shared_ptr<PCHContainerOperations> PCHContainerOps, DiagnosticConsumer *DiagConsumer) { // Create a compiler instance to handle the actual work. clang::CompilerInstance Compiler(std::move(PCHContainerOps)); - Compiler.setInvocation(Invocation); + Compiler.setInvocation(std::move(Invocation)); Compiler.setFileManager(Files); // The FrontendAction can have lifetime requirements for Compiler or its @@ -474,7 +474,8 @@ class ASTBuilderAction : public ToolAction { public: ASTBuilderAction(std::vector<std::unique_ptr<ASTUnit>> &ASTs) : ASTs(ASTs) {} - bool runInvocation(CompilerInvocation *Invocation, FileManager *Files, + bool runInvocation(std::shared_ptr<CompilerInvocation> Invocation, + FileManager *Files, std::shared_ptr<PCHContainerOperations> PCHContainerOps, DiagnosticConsumer *DiagConsumer) override { std::unique_ptr<ASTUnit> AST = ASTUnit::LoadFromCompilerInvocation( diff --git a/test/CodeGen/builtins-ppc-error.c b/test/CodeGen/builtins-ppc-error.c new file mode 100644 index 000000000000..5860c4f9e77e --- /dev/null +++ b/test/CodeGen/builtins-ppc-error.c @@ -0,0 +1,20 @@ +// REQUIRES: powerpc-registered-target + +// RUN: %clang_cc1 -faltivec -target-feature +power9-vector \ +// RUN: -triple powerpc64-unknown-unknown -fsyntax-only \ +// RUN: -Wall -Werror -verify %s + +// RUN: %clang_cc1 -faltivec -target-feature +power9-vector \ +// RUN: -triple powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -Wall -Werror -verify %s + +#include <altivec.h> + +extern vector signed int vsi; +extern vector unsigned char vuc; + +void testInsertWord1(void) { + int index = 5; + vector unsigned char v1 = vec_insert4b(vsi, vuc, index); // expected-error {{argument to '__builtin_vsx_insertword' must be a constant integer}} + vector unsigned long long v2 = vec_extract4b(vuc, index); // expected-error {{argument to '__builtin_vsx_extractuword' must be a constant integer}} +} diff --git a/test/CodeGen/builtins-ppc-p9vector.c b/test/CodeGen/builtins-ppc-p9vector.c index f70d2f9f1504..bd0ad182f15f 100644 --- a/test/CodeGen/builtins-ppc-p9vector.c +++ b/test/CodeGen/builtins-ppc-p9vector.c @@ -1166,17 +1166,52 @@ vector float test114(void) { // CHECK-BE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> // CHECK-BE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) // CHECK-BE-NEXT: ret <4 x float> -// CHECK-LE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> -// CHECK-LE:
@llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) -// CHECK-LE-NEXT: ret <4 x float> +// CHECK: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> +// CHECK: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) +// CHECK-NEXT: ret <4 x float> return vec_extract_fp32_from_shorth(vusa); } vector float test115(void) { // CHECK-BE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> // CHECK-BE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) // CHECK-BE-NEXT: ret <4 x float> -// CHECK-LE: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> -// CHECK-LE: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) -// CHECK-LE-NEXT: ret <4 x float> +// CHECK: shufflevector <8 x i16> {{.+}}, <8 x i16> {{.+}}, <8 x i32> +// CHECK: @llvm.ppc.vsx.xvcvhpsp(<8 x i16> {{.+}}) +// CHECK-NEXT: ret <4 x float> return vec_extract_fp32_from_shortl(vusa); } +vector unsigned char test116(void) { +// CHECK-BE: [[T1:%.+]] = call <4 x i32> @llvm.ppc.vsx.xxinsertw(<4 x i32> {{.+}}, <2 x i64> {{.+}}, i32 7) +// CHECK-BE-NEXT: bitcast <4 x i32> [[T1]] to <16 x i8> +// CHECK: [[T1:%.+]] = shufflevector <2 x i64> {{.+}}, <2 x i64> {{.+}}, <2 x i32> +// CHECK-NEXT: [[T2:%.+]] = bitcast <2 x i64> [[T1]] to <4 x i32> +// CHECK-NEXT: [[T3:%.+]] = call <4 x i32> @llvm.ppc.vsx.xxinsertw(<4 x i32> [[T2]], <2 x i64> {{.+}}, i32 5) +// CHECK-NEXT: bitcast <4 x i32> [[T3]] to <16 x i8> + return vec_insert4b(vuia, vuca, 7); +} +vector unsigned char test117(void) { +// CHECK-BE: [[T1:%.+]] = call <4 x i32> @llvm.ppc.vsx.xxinsertw(<4 x i32> {{.+}}, <2 x i64> {{.+}}, i32 12) +// CHECK-BE-NEXT: bitcast <4 x i32> [[T1]] to <16 x i8> +// CHECK: [[T1:%.+]] = shufflevector <2 x i64> {{.+}}, <2 x i64> {{.+}}, <2 x i32> +// CHECK-NEXT: [[T2:%.+]] = bitcast <2 x i64> [[T1]] to <4 x i32> +// CHECK-NEXT: [[T3:%.+]] = call <4 x i32> @llvm.ppc.vsx.xxinsertw(<4 x i32> [[T2]], <2 x i64> {{.+}}, i32 0) +// CHECK-NEXT: bitcast <4 x i32> [[T3]] to <16 x i8> + return vec_insert4b(vuia, vuca, 13); +} +vector unsigned long long test118(void) { +// CHECK-BE: call <2 x i64> @llvm.ppc.vsx.xxextractuw(<2 x i64> {{.+}}, i32 11) +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: [[T1:%.+]] = call <2 x i64> @llvm.ppc.vsx.xxextractuw(<2 x i64> {{.+}}, i32 1) +// CHECK-NEXT: shufflevector <2 x i64> [[T1]], <2 x i64> [[T1]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> + return vec_extract4b(vuca, 11); +} +vector unsigned long long test119(void) { +// CHECK-BE: call <2 x i64> @llvm.ppc.vsx.xxextractuw(<2 x i64> {{.+}}, i32 0) +// CHECK-BE-NEXT: ret <2 x i64> +// CHECK: [[T1:%.+]] = call <2 x i64> @llvm.ppc.vsx.xxextractuw(<2 x i64> {{.+}}, i32 12) +// CHECK-NEXT: shufflevector <2 x i64> [[T1]], <2 x i64> [[T1]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> + return vec_extract4b(vuca, -5); +} + diff --git a/test/CodeGen/catch-undef-behavior.c b/test/CodeGen/catch-undef-behavior.c index c2f01ae1a66f..d7a26f8a7d4b 100644 --- a/test/CodeGen/catch-undef-behavior.c +++ b/test/CodeGen/catch-undef-behavior.c @@ -6,16 +6,16 @@ // CHECK-UBSAN: @[[INT:.*]] = private unnamed_addr constant { i16, i16, [6 x i8] } { i16 0, i16 11, [6 x i8] c"'int'\00" } // FIXME: When we only emit each type once, use [[INT]] more below. 
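The switch in the checks below from @__ubsan_handle_type_mismatch to @__ubsan_handle_type_mismatch_v1 also changes the layout of the static check data: the v1 handler receives the alignment as log2(alignment) packed into a single i8 rather than as a pointer-sized integer, so "i64 4, i8 1" becomes "i8 2, i8 1" (log2(4) = 2; the trailing byte is the type-check kind). A minimal C++ sketch of that encoding, assuming a power-of-two alignment; the function name is illustrative, not part of the UBSan runtime:

#include <cstdint>

// Illustrative only: the log2-encoded alignment byte that the
// type_mismatch_v1 data carries for a power-of-two alignment.
constexpr uint8_t logAlignment(uint64_t Align) {
  uint8_t Log2 = 0;
  while (Align > 1) {
    Align >>= 1; // halve until we reach 1, counting the steps
    ++Log2;
  }
  return Log2; // logAlignment(4) == 2, matching "i8 2" in the checks below
}

static_assert(logAlignment(4) == 2, "4-byte alignment encodes as 2");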
-// CHECK-UBSAN: @[[LINE_100:.*]] = private unnamed_addr global {{.*}}, i32 100, i32 5 {{.*}} @[[INT]], i64 4, i8 1 -// CHECK-UBSAN: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 10 {{.*}}, i64 4, i8 0 +// CHECK-UBSAN: @[[LINE_100:.*]] = private unnamed_addr global {{.*}}, i32 100, i32 5 {{.*}} @[[INT]], i8 2, i8 1 +// CHECK-UBSAN: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 10 {{.*}}, i8 2, i8 0 // CHECK-UBSAN: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 12 {{.*}} @{{.*}}, {{.*}} @{{.*}} // CHECK-UBSAN: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 12 {{.*}} @{{.*}}, {{.*}} @{{.*}} -// CHECK-UBSAN: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 10 {{.*}} @{{.*}}, i64 4, i8 0 } -// CHECK-UBSAN: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 3 {{.*}} @{{.*}}, i64 4, i8 1 } +// CHECK-UBSAN: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 10 {{.*}} @{{.*}}, i8 2, i8 0 } +// CHECK-UBSAN: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 3 {{.*}} @{{.*}}, i8 2, i8 1 } // CHECK-UBSAN: @[[STRUCT_S:.*]] = private unnamed_addr constant { i16, i16, [11 x i8] } { i16 -1, i16 0, [11 x i8] c"'struct S'\00" } -// CHECK-UBSAN: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 14 {{.*}} @[[STRUCT_S]], i64 4, i8 3 } +// CHECK-UBSAN: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 14 {{.*}} @[[STRUCT_S]], i8 2, i8 3 } // CHECK-UBSAN: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 12 {{.*}} @{{.*}} } // CHECK-UBSAN: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 11 {{.*}} @{{.*}} } // CHECK-UBSAN: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 10 {{.*}} @{{.*}} } @@ -54,7 +54,7 @@ void foo() { // CHECK-TRAP: br i1 %[[OK]], {{.*}} // CHECK-UBSAN: %[[ARG:.*]] = ptrtoint {{.*}} %[[PTR]] to i64 - // CHECK-UBSAN-NEXT: call void @__ubsan_handle_type_mismatch(i8* bitcast ({{.*}} @[[LINE_100]] to i8*), i64 %[[ARG]]) + // CHECK-UBSAN-NEXT: call void @__ubsan_handle_type_mismatch_v1(i8* bitcast ({{.*}} @[[LINE_100]] to i8*), i64 %[[ARG]]) // CHECK-TRAP: call void @llvm.trap() [[NR_NUW:#[0-9]+]] // CHECK-TRAP-NEXT: unreachable @@ -62,7 +62,7 @@ void foo() { // With -fsanitize=null, only perform the null check. 
// CHECK-NULL: %[[NULL:.*]] = icmp ne {{.*}}, null // CHECK-NULL: br i1 %[[NULL]] - // CHECK-NULL: call void @__ubsan_handle_type_mismatch(i8* bitcast ({{.*}} @[[LINE_100]] to i8*), i64 %{{.*}}) + // CHECK-NULL: call void @__ubsan_handle_type_mismatch_v1(i8* bitcast ({{.*}} @[[LINE_100]] to i8*), i64 %{{.*}}) #line 100 u.i=1; } @@ -77,7 +77,7 @@ int bar(int *a) { // CHECK-COMMON-NEXT: icmp eq i64 %[[MISALIGN]], 0 // CHECK-UBSAN: %[[ARG:.*]] = ptrtoint - // CHECK-UBSAN-NEXT: call void @__ubsan_handle_type_mismatch(i8* bitcast ({{.*}} @[[LINE_200]] to i8*), i64 %[[ARG]]) + // CHECK-UBSAN-NEXT: call void @__ubsan_handle_type_mismatch_v1(i8* bitcast ({{.*}} @[[LINE_200]] to i8*), i64 %[[ARG]]) // CHECK-TRAP: call void @llvm.trap() [[NR_NUW]] // CHECK-TRAP-NEXT: unreachable @@ -145,7 +145,7 @@ int rsh_inbounds(int a, int b) { // CHECK-COMMON-LABEL: @load int load(int *p) { - // CHECK-UBSAN: call void @__ubsan_handle_type_mismatch(i8* bitcast ({{.*}} @[[LINE_500]] to i8*), i64 %{{.*}}) + // CHECK-UBSAN: call void @__ubsan_handle_type_mismatch_v1(i8* bitcast ({{.*}} @[[LINE_500]] to i8*), i64 %{{.*}}) // CHECK-TRAP: call void @llvm.trap() [[NR_NUW]] // CHECK-TRAP-NEXT: unreachable @@ -155,7 +155,7 @@ int load(int *p) { // CHECK-COMMON-LABEL: @store void store(int *p, int q) { - // CHECK-UBSAN: call void @__ubsan_handle_type_mismatch(i8* bitcast ({{.*}} @[[LINE_600]] to i8*), i64 %{{.*}}) + // CHECK-UBSAN: call void @__ubsan_handle_type_mismatch_v1(i8* bitcast ({{.*}} @[[LINE_600]] to i8*), i64 %{{.*}}) // CHECK-TRAP: call void @llvm.trap() [[NR_NUW]] // CHECK-TRAP-NEXT: unreachable @@ -167,7 +167,7 @@ struct S { int k; }; // CHECK-COMMON-LABEL: @member_access int *member_access(struct S *p) { - // CHECK-UBSAN: call void @__ubsan_handle_type_mismatch(i8* bitcast ({{.*}} @[[LINE_700]] to i8*), i64 %{{.*}}) + // CHECK-UBSAN: call void @__ubsan_handle_type_mismatch_v1(i8* bitcast ({{.*}} @[[LINE_700]] to i8*), i64 %{{.*}}) // CHECK-TRAP: call void @llvm.trap() [[NR_NUW]] // CHECK-TRAP-NEXT: unreachable diff --git a/test/CodeGen/sanitize-recover.c b/test/CodeGen/sanitize-recover.c index b263f5163181..dd8734e971eb 100644 --- a/test/CodeGen/sanitize-recover.c +++ b/test/CodeGen/sanitize-recover.c @@ -33,7 +33,7 @@ void foo() { // PARTIAL: br i1 %[[CHECK012]], {{.*}} !prof ![[WEIGHT_MD:.*]], !nosanitize // PARTIAL: br i1 %[[CHECK02]], {{.*}} - // PARTIAL: call void @__ubsan_handle_type_mismatch_abort( + // PARTIAL: call void @__ubsan_handle_type_mismatch_v1_abort( // PARTIAL-NEXT: unreachable - // PARTIAL: call void @__ubsan_handle_type_mismatch( + // PARTIAL: call void @__ubsan_handle_type_mismatch_v1( } diff --git a/test/CodeGen/vectorcall.c b/test/CodeGen/vectorcall.c index b38d5e5fbc5b..167f72ca2cfd 100644 --- a/test/CodeGen/vectorcall.c +++ b/test/CodeGen/vectorcall.c @@ -1,22 +1,22 @@ -// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s -// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64 +// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=i386-pc-win32 | FileCheck %s --check-prefix=X32 +// RUN: %clang_cc1 -emit-llvm %s -o - -ffreestanding -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64 void __vectorcall v1(int a, int b) {} -// CHECK: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b) +// X32: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b) // X64: define x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b) void __vectorcall v2(char a, char b) {} -// CHECK: define 
x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b) +// X32: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b) // X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b) struct Small { int x; }; void __vectorcall v3(int a, struct Small b, int c) {} -// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, i32 %b.0, i32 inreg %c) +// X32: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, i32 %b.0, i32 inreg %c) // X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c) struct Large { int a[5]; }; void __vectorcall v4(int a, struct Large b, int c) {} -// CHECK: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c) +// X32: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c) // X64: define x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c) struct HFA2 { double x, y; }; @@ -24,54 +24,84 @@ struct HFA4 { double w, x, y, z; }; struct HFA5 { double v, w, x, y, z; }; void __vectorcall hfa1(int a, struct HFA4 b, int c) {} -// CHECK: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c) -// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c) +// X32: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, %struct.HFA4 inreg %b.coerce, i32 inreg %c) +// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, %struct.HFA4 inreg %b.coerce, i32 %c) // HFAs that would require more than six total SSE registers are passed // indirectly. Additional vector arguments can consume the rest of the SSE // registers. void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// CHECK: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* inreg %b, double %c) -// X64: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* %b, double %c) +// X32: define x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, %struct.HFA4* inreg %b, double %c) +// X64: define x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, %struct.HFA4* %b, double %c) // Ensure that we pass builtin types directly while counting them against the // SSE register usage. void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {} -// CHECK: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f) +// X32: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f) // X64: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* %f) // Aggregates with more than four elements are not HFAs and are passed byval. // Because they are not classified as homogeneous, they don't get special // handling to ensure alignment. void __vectorcall hfa4(struct HFA5 a) {} -// CHECK: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4) +// X32: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4) // X64: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a) // Return HFAs of 4 or fewer elements in registers. 
static struct HFA2 g_hfa2; struct HFA2 __vectorcall hfa5(void) { return g_hfa2; } -// CHECK: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"() +// X32: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"() // X64: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"() typedef float __attribute__((vector_size(16))) v4f32; struct HVA2 { v4f32 x, y; }; +struct HVA3 { v4f32 w, x, y; }; struct HVA4 { v4f32 w, x, y, z; }; +struct HVA5 { v4f32 w, x, y, z, p; }; -void __vectorcall hva1(int a, struct HVA4 b, int c) {} -// CHECK: define x86_vectorcallcc void @"\01hva1@@72"(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c) -// X64: define x86_vectorcallcc void @"\01hva1@@80"(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c) +v4f32 __vectorcall hva1(int a, struct HVA4 b, int c) {return b.w;} +// X32: define x86_vectorcallcc <4 x float> @"\01hva1@@72"(i32 inreg %a, %struct.HVA4 inreg %b.coerce, i32 inreg %c) +// X64: define x86_vectorcallcc <4 x float> @"\01hva1@@80"(i32 %a, %struct.HVA4 inreg %b.coerce, i32 %c) -void __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {} -// CHECK: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* inreg %b, <4 x float> %c) -// X64: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* %b, <4 x float> %c) +v4f32 __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {return c;} +// X32: define x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* inreg %b, <4 x float> %c) +// X64: define x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b, <4 x float> %c) -void __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {} -// CHECK: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f) -// X64: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) +v4f32 __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {return f.x;} +// X32: define x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f) +// X64: define x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) + +// vector types have higher priority than HVA structures, so vector types are allocated first +// and HVAs are allocated if enough registers are available v4f32 __vectorcall hva4(struct HVA4 a, struct HVA2 b, v4f32 c) {return b.y;} +// X32: define x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, %struct.HVA2* inreg %b, <4 x float> %c) +// X64: define x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, %struct.HVA2* %b, <4 x float> %c) + +v4f32 __vectorcall hva5(struct HVA3 a, struct HVA3 b, v4f32 c, struct HVA2 d) {return d.y;} +// X32: define x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, %struct.HVA3* inreg %b, <4 x float> %c, %struct.HVA2 inreg %d.coerce) +// X64: define x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %d.coerce)
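The comment above hva4 states the allocation rule these functions pin down: plain vector arguments claim XMM registers on the first pass, and HVAs only receive registers on a second pass if enough of the six slots remain; otherwise they are passed indirectly. A rough C++ model of that two-pass scheme; the types and the simplified register count are assumptions for illustration, not the actual TargetInfo.cpp logic:

#include <vector>

// Simplified vectorcall model: pass 1 assigns XMM0-XMM5 to plain vector
// arguments, pass 2 hands leftover registers to HVAs element by element.
// An HVA that does not fit in the remaining registers goes indirect.
enum class Kind { Vector, HVA };
struct Arg { Kind K; int Elems; bool InRegs = false; };

void allocateRegisters(std::vector<Arg> &Args) {
  int Free = 6; // six SSE registers are available to vectorcall
  for (Arg &A : Args) // first pass: plain vectors
    if (A.K == Kind::Vector && Free > 0) { A.InRegs = true; --Free; }
  for (Arg &A : Args) // second pass: HVAs need one register per element
    if (A.K == Kind::HVA && A.Elems <= Free) { A.InRegs = true; Free -= A.Elems; }
}

Tracing hva4 above through this model: c takes one register on the first pass; on the second pass the four-element HVA4 a still fits in the five remaining registers, but the two-element HVA2 b no longer does, which is why b is passed indirectly in both CHECK lines.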
+ +struct HVA4 __vectorcall hva6(struct HVA4 a, struct HVA4 b) { return b;} +// X32: define x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* inreg %b) +// X64: define x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b) + +struct HVA5 __vectorcall hva7() {struct HVA5 a = {}; return a;} +// X32: define x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* inreg noalias sret %agg.result) +// X64: define x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* noalias sret %agg.result) + +v4f32 __vectorcall hva8(v4f32 a, v4f32 b, v4f32 c, v4f32 d, int e, v4f32 f) {return f;} +// X32: define x86_vectorcallcc <4 x float> @"\01hva8@@84"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 inreg %e, <4 x float> %f) +// X64: define x86_vectorcallcc <4 x float> @"\01hva8@@88"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) typedef float __attribute__((ext_vector_type(3))) v3f32; struct OddSizeHVA { v3f32 x, y; }; void __vectorcall odd_size_hva(struct OddSizeHVA a) {} -// CHECK: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1) -// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1) +// X32: define x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce) +// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(%struct.OddSizeHVA inreg %a.coerce) + +// The Vectorcall ABI only allows passing the first 6 items in registers, so this shouldn't +// consider 'p7' as a register. Instead p5 gets put into the register on the second pass. +struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, int p4, struct HFA2 p5, float p6, float p7){ return p1;} +// X32: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@80"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* inreg %p3, i32 inreg %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7) +// X64: define x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@96"(%struct.HFA2 inreg %p1.coerce, float %p2, %struct.HFA4* %p3, i32 %p4, %struct.HFA2 inreg %p5.coerce, float %p6, float %p7) diff --git a/test/CodeGenCXX/dllexport.cpp b/test/CodeGenCXX/dllexport.cpp index eb9ca79b7b40..116176e2cb92 100644 --- a/test/CodeGenCXX/dllexport.cpp +++ b/test/CodeGenCXX/dllexport.cpp @@ -515,6 +515,18 @@ struct __declspec(dllexport) ClassWithClosure { // M32-DAG: ret void }; +template <typename T> struct TemplateWithClosure { + TemplateWithClosure(int x = sizeof(T)) {} +}; +extern template struct TemplateWithClosure<char>; +template struct __declspec(dllexport) TemplateWithClosure<char>; +extern template struct TemplateWithClosure<int>; +template struct __declspec(dllexport) TemplateWithClosure<int>; +// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_F?$TemplateWithClosure@D@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat +// M32-DAG: call {{.*}} @"\01??0?$TemplateWithClosure@D@@QAE@H@Z"({{.*}}, i32 1) +// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_F?$TemplateWithClosure@H@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat +// M32-DAG: call {{.*}} @"\01??0?$TemplateWithClosure@H@@QAE@H@Z"({{.*}}, i32 4) + struct __declspec(dllexport) NestedOuter { DELETE_IMPLICIT_MEMBERS(NestedOuter); NestedOuter(void *p = 0) {} diff --git a/test/CodeGenCXX/homogeneous-aggregates.cpp b/test/CodeGenCXX/homogeneous-aggregates.cpp index 67911c0d7f90..1338b25e21ae 100644 --- a/test/CodeGenCXX/homogeneous-aggregates.cpp +++
b/test/CodeGenCXX/homogeneous-aggregates.cpp @@ -47,7 +47,7 @@ D1 CC func_D1(D1 x) { return x; } // PPC: define [3 x double] @_Z7func_D22D2([3 x double] %x.coerce) // ARM32: define arm_aapcs_vfpcc %struct.D2 @_Z7func_D22D2(%struct.D2 %x.coerce) // ARM64: define %struct.D2 @_Z7func_D22D2([3 x double] %x.coerce) -// X64: define x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(double %x.0, double %x.1, double %x.2) +// X64: define x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(%struct.D2 inreg %x.coerce) D2 CC func_D2(D2 x) { return x; } // PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce) @@ -92,7 +92,7 @@ struct HVAWithEmptyBase : Float1, Empty, Float2 { float z; }; void CC with_empty_base(HVAWithEmptyBase a) {} // FIXME: MSVC doesn't consider this an HVA because of the empty base. -// X64: define x86_vectorcallcc void @"\01_Z15with_empty_base16HVAWithEmptyBase@@16"(float %a.0, float %a.1, float %a.2) +// X64: define x86_vectorcallcc void @"\01_Z15with_empty_base16HVAWithEmptyBase@@16"(%struct.HVAWithEmptyBase inreg %a.coerce) struct HVAWithEmptyBitField : Float1, Float2 { int : 0; // Takes no space. @@ -102,5 +102,5 @@ struct HVAWithEmptyBitField : Float1, Float2 { // PPC: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField([3 x float] %a.coerce) // ARM64: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField([3 x float] %a.coerce) // ARM32: define arm_aapcs_vfpcc void @_Z19with_empty_bitfield20HVAWithEmptyBitField(%struct.HVAWithEmptyBitField %a.coerce) -// X64: define x86_vectorcallcc void @"\01_Z19with_empty_bitfield20HVAWithEmptyBitField@@16"(float %a.0, float %a.1, float %a.2) +// X64: define x86_vectorcallcc void @"\01_Z19with_empty_bitfield20HVAWithEmptyBitField@@16"(%struct.HVAWithEmptyBitField inreg %a.coerce) void CC with_empty_bitfield(HVAWithEmptyBitField a) {} diff --git a/test/CodeGenCXX/ubsan-vtable-checks.cpp b/test/CodeGenCXX/ubsan-vtable-checks.cpp index 80af77d4ea6d..e684ae9180f1 100644 --- a/test/CodeGenCXX/ubsan-vtable-checks.cpp +++ b/test/CodeGenCXX/ubsan-vtable-checks.cpp @@ -21,7 +21,7 @@ int get_v(T* t) { // CHECK-NULL-NOT: load {{.*}} (%struct.T*{{.*}})**, {{.*}} (%struct.T*{{.*}})*** // CHECK-NULL: [[UBSAN_CMP_RES:%[0-9]+]] = icmp ne %struct.T* %{{[_a-z0-9]+}}, null // CHECK-NULL-NEXT: br i1 [[UBSAN_CMP_RES]], label %{{.*}}, label %{{.*}} - // CHECK-NULL: call void @__ubsan_handle_type_mismatch_abort + // CHECK-NULL: call void @__ubsan_handle_type_mismatch_v1_abort // Second, we check that vtable is actually loaded once the type check is done. 
// CHECK-NULL: load {{.*}} (%struct.T*{{.*}})**, {{.*}} (%struct.T*{{.*}})*** return t->v(); diff --git a/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/.keep b/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/include/.keep b/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/include/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/lib/.keep b/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/lib/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/nvvm/libdevice/libdevice.compute_30.10.bc b/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/nvvm/libdevice/libdevice.compute_30.10.bc new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/nvvm/libdevice/libdevice.compute_35.10.bc b/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/nvvm/libdevice/libdevice.compute_35.10.bc new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Driver/avr-toolchain.c b/test/Driver/avr-toolchain.c new file mode 100644 index 000000000000..46a3c10fa3a1 --- /dev/null +++ b/test/Driver/avr-toolchain.c @@ -0,0 +1,4 @@ +// A basic clang -cc1 command-line. + +// RUN: %clang %s -### -no-canonical-prefixes -target avr 2>&1 | FileCheck -check-prefix=CC1 %s +// CC1: clang{{.*}} "-cc1" "-triple" "avr" diff --git a/test/Driver/cuda-version-check.cu b/test/Driver/cuda-version-check.cu index cb2ac7994f75..46ca72f2ea0c 100644 --- a/test/Driver/cuda-version-check.cu +++ b/test/Driver/cuda-version-check.cu @@ -2,40 +2,40 @@ // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target -// RUN: %clang -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK -// RUN: %clang -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK -// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK // The installation at Inputs/CUDA is CUDA 7.0, which doesn't support sm_60. -// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 // This should only complain about sm_60, not sm_35. 
-// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_35 \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_35 \ // RUN: --sysroot=%S/Inputs/CUDA 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 --check-prefix=OK_SM35 // We should get two errors here, one for sm_60 and one for sm_61. -// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_61 \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_61 \ // RUN: --sysroot=%S/Inputs/CUDA 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 --check-prefix=ERR_SM61 // We should still get an error if we pass -nocudainc, because this compilation // would invoke ptxas, and we do a version check on that, too. -// RUN: %clang -v -### --cuda-gpu-arch=sm_60 -nocudainc --sysroot=%S/Inputs/CUDA 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 -nocudainc --sysroot=%S/Inputs/CUDA 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 // If with -nocudainc and -E, we don't touch the CUDA install, so we // shouldn't get an error. -// RUN: %clang -v -### -E --cuda-device-only --cuda-gpu-arch=sm_60 -nocudainc \ +// RUN: %clang --target=x86_64-linux -v -### -E --cuda-device-only --cuda-gpu-arch=sm_60 -nocudainc \ // RUN: --sysroot=%S/Inputs/CUDA 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK // --no-cuda-version-check should suppress all of these errors. -// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 \ // RUN: --no-cuda-version-check %s | \ // RUN: FileCheck %s --check-prefix=OK @@ -43,9 +43,9 @@ // therefore we should not get an error in host-only mode. We use the -S here // to avoid the error being produced in case by the assembler tool, which does // the same check. 
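The comments around these RUN lines describe when the driver actually validates the CUDA installation against the requested GPU arch: whenever device-side compilation or ptxas would run, and never with --no-cuda-version-check. A hedged C++ sketch of that gate for the RUN lines that follow; the names and the version threshold are illustrative assumptions, not the Driver's real data structures:

// Illustration of the version gate these RUN lines exercise: sm_60 and
// newer need CUDA 8.0, which the CUDA 7.0 tree under Inputs/CUDA lacks.
struct CudaInstallation { int Major = 7, Minor = 0; };

bool archIsSupported(const CudaInstallation &Install, int SmArch,
                     bool VersionCheckDisabled) {
  if (VersionCheckDisabled)
    return true;               // --no-cuda-version-check suppresses the error
  if (SmArch >= 60)
    return Install.Major >= 8; // Pascal (sm_60/sm_61) arrived with CUDA 8.0
  return true;                 // older arches are fine on CUDA 7.0
}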
-// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK -// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --sysroot=%S/Inputs/CUDA -S 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 // OK-NOT: error: GPU arch diff --git a/test/Driver/cuda-windows.cu b/test/Driver/cuda-windows.cu new file mode 100644 index 000000000000..1d67710647c0 --- /dev/null +++ b/test/Driver/cuda-windows.cu @@ -0,0 +1,14 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target +// +// RUN: %clang -v --target=i386-pc-windows-msvc \ +// RUN: --sysroot=%S/Inputs/CUDA-windows 2>&1 %s -### | FileCheck %s +// RUN: %clang -v --target=i386-pc-windows-mingw32 \ +// RUN: --sysroot=%S/Inputs/CUDA-windows 2>&1 %s -### | FileCheck %s + +// CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0 +// CHECK: "-cc1" "-triple" "nvptx-nvidia-cuda" +// CHECK-SAME: "-fms-extensions" +// CHECK-SAME: "-fms-compatibility" +// CHECK-SAME: "-fms-compatibility-version= diff --git a/test/Index/complete-block-properties.m b/test/Index/complete-block-properties.m index d166147294e1..4697703c8e5c 100644 --- a/test/Index/complete-block-properties.m +++ b/test/Index/complete-block-properties.m @@ -43,7 +43,7 @@ typedef int (^BarBlock)(int *); //CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType void}{TypedText block}{LeftParen (}{RightParen )} (35) //CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType void (^)()}{TypedText block}{Equal = }{Placeholder ^(void)} (38) //CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType Foo}{TypedText blocker}{LeftParen (}{Placeholder int x}{Comma , }{Placeholder Foo y}{Comma , }{Placeholder ^(Foo *someParameter)foo}{RightParen )} (35) -//CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType Foo (^)(int, Foo, FooBlock)}{TypedText blocker}{Equal = }{Placeholder ^Foo(int x, Foo y, FooBlock foo)} (38) +//CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType Foo (^)(int, Foo, FooBlock)}{TypedText blocker}{Equal = }{Placeholder ^Foo(int x, Foo y, FooBlock foo)} (32) //CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType int}{TypedText foo} (35) //CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType void}{TypedText fooBlock}{LeftParen (}{Placeholder Foo *someParameter}{RightParen )} (35) //CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType Test *}{TypedText getObject}{LeftParen (}{Placeholder int index}{RightParen )} (35) diff --git a/test/Index/complete-block-property-assignment.m b/test/Index/complete-block-property-assignment.m index ced3b7fa1302..908e18629528 100644 --- a/test/Index/complete-block-property-assignment.m +++ b/test/Index/complete-block-property-assignment.m @@ -15,6 +15,7 @@ typedef void (^FooBlock)(Foo *someParameter); @interface Test : Obj @property (readwrite, nonatomic, copy) FooBlock onEventHandler; @property (readonly, nonatomic, copy) void (^onReadonly)(int *someParameter); +@property (readwrite, nonatomic, copy) int (^processEvent)(int eventCode); @property (readonly, nonatomic, strong) Obj *obj; @end @@ -29,10 +30,10 @@ typedef void (^FooBlock)(Foo *someParameter); SELFY.foo = 2 } -// RUN: c-index-test -code-completion-at=%s:26:8 %s | FileCheck -check-prefix=CHECK-CC1 %s -// RUN: c-index-test 
-code-completion-at=%s:27:27 %s | FileCheck -check-prefix=CHECK-CC1 %s -// RUN: c-index-test -code-completion-at=%s:28:22 %s | FileCheck -check-prefix=CHECK-CC1 %s -// RUN: c-index-test -code-completion-at=%s:29:9 %s | FileCheck -check-prefix=CHECK-CC1 %s +// RUN: c-index-test -code-completion-at=%s:27:8 %s | FileCheck -check-prefix=CHECK-CC1 %s +// RUN: c-index-test -code-completion-at=%s:28:27 %s | FileCheck -check-prefix=CHECK-CC1 %s +// RUN: c-index-test -code-completion-at=%s:29:22 %s | FileCheck -check-prefix=CHECK-CC1 %s +// RUN: c-index-test -code-completion-at=%s:30:9 %s | FileCheck -check-prefix=CHECK-CC1 %s // CHECK-CC1: ObjCPropertyDecl:{ResultType int}{TypedText foo} (35) // CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType Obj *}{TypedText obj} (35) // CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType void}{TypedText onAction}{LeftParen (}{Placeholder Obj *object}{RightParen )} (35) @@ -40,6 +41,8 @@ typedef void (^FooBlock)(Foo *someParameter); // CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType void}{TypedText onEventHandler}{LeftParen (}{Placeholder Foo *someParameter}{RightParen )} (35) // CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType FooBlock}{TypedText onEventHandler}{Equal = }{Placeholder ^(Foo *someParameter)} (38) // CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType void}{TypedText onReadonly}{LeftParen (}{Placeholder int *someParameter}{RightParen )} (35) +// CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType int}{TypedText processEvent}{LeftParen (}{Placeholder int eventCode}{RightParen )} (35) +// CHECK-CC1-NEXT: ObjCPropertyDecl:{ResultType int (^)(int)}{TypedText processEvent}{Equal = }{Placeholder ^int(int eventCode)} (32) - (void) takeInt:(int)x { } @@ -53,16 +56,17 @@ typedef void (^FooBlock)(Foo *someParameter); return self.foo; } -// RUN: c-index-test -code-completion-at=%s:47:9 %s | FileCheck -check-prefix=CHECK-NO %s -// RUN: c-index-test -code-completion-at=%s:48:16 %s | FileCheck -check-prefix=CHECK-NO %s -// RUN: c-index-test -code-completion-at=%s:49:23 %s | FileCheck -check-prefix=CHECK-NO %s -// RUN: c-index-test -code-completion-at=%s:50:12 %s | FileCheck -check-prefix=CHECK-NO %s -// RUN: c-index-test -code-completion-at=%s:51:15 %s | FileCheck -check-prefix=CHECK-NO %s -// RUN: c-index-test -code-completion-at=%s:53:15 %s | FileCheck -check-prefix=CHECK-NO %s +// RUN: c-index-test -code-completion-at=%s:50:9 %s | FileCheck -check-prefix=CHECK-NO %s +// RUN: c-index-test -code-completion-at=%s:51:16 %s | FileCheck -check-prefix=CHECK-NO %s +// RUN: c-index-test -code-completion-at=%s:52:23 %s | FileCheck -check-prefix=CHECK-NO %s +// RUN: c-index-test -code-completion-at=%s:53:12 %s | FileCheck -check-prefix=CHECK-NO %s +// RUN: c-index-test -code-completion-at=%s:54:15 %s | FileCheck -check-prefix=CHECK-NO %s +// RUN: c-index-test -code-completion-at=%s:56:15 %s | FileCheck -check-prefix=CHECK-NO %s // CHECK-NO: ObjCPropertyDecl:{ResultType int}{TypedText foo} (35) // CHECK-NO-NEXT: ObjCPropertyDecl:{ResultType Obj *}{TypedText obj} (35) // CHECK-NO-NEXT: ObjCPropertyDecl:{ResultType void (^)(Obj *)}{TypedText onAction} (35) // CHECK-NO-NEXT: ObjCPropertyDecl:{ResultType FooBlock}{TypedText onEventHandler} (35) // CHECK-NO-NEXT: ObjCPropertyDecl:{ResultType void (^)(int *)}{TypedText onReadonly} (35) +// CHECK-NO-NEXT: ObjCPropertyDecl:{ResultType int (^)(int)}{TypedText processEvent} (35) @end diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp index 287089d7c45e..59c4d5b277ce 100644 --- a/test/OpenMP/nvptx_target_codegen.cpp +++ 
b/test/OpenMP/nvptx_target_codegen.cpp @@ -8,9 +8,6 @@ #ifndef HEADER #define HEADER -// CHECK-DAG: [[OMP_NT:@.+]] = common addrspace(3) global i32 0 -// CHECK-DAG: [[OMP_WID:@.+]] = common addrspace(3) global i64 0 - template<typename tx, typename ty> struct TT{ tx X; ty Y; }; @@ -26,19 +23,22 @@ int foo(int n) { double cn[5][n]; TT<long long, char> d; - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l87}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -54,31 +54,34 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l87]]() - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]() + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] // CHECK: {{call|invoke}} void [[T1]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: br label {{%?}}[[EXIT:.+]] // - // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1
[[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] + // + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -93,19 +96,22 @@ int foo(int n) { { } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l158}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -121,35 +127,38 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l158]](i[[SZ:32|64]] [[ARG1:%[^)]+]]) + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]]) // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]], // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T3]]_worker() - 
// CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T2]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // CHECK: load i16, i16* [[AA_CADDR]], - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -160,19 +169,22 @@ int foo(int n) { aa += 1; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l261}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -188,7 +200,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+foo.+l261]](i[[SZ]] + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]] // Create local storage for each capture. 
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* @@ -219,26 +231,29 @@ int foo(int n) { // CHECK-DAG: [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]], // CHECK-DAG: [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]], // - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T4]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T3]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // // Use captures. 
// CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -249,10 +264,10 @@ int foo(int n) { // CHECK-DAG: getelementptr inbounds double, double* [[REF_CN]], i[[SZ]] %{{.+}} // CHECK-DAG: getelementptr inbounds [[TT]], [[TT]]* [[REF_D]], i32 0, i32 0 // - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -338,19 +353,22 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+l298}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -366,7 +384,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+static.+l298]](i[[SZ]] + // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]] // Create local storage for each capture. 
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -382,36 +400,37 @@ int bar(int n){ // CHECK-DAG: [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8* // CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], // - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T5]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T4]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] // - // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // CHECK-64-DAG: load i32, i32* [[REF_A]] // CHECK-32-DAG: load i32, i32* [[LOCAL_A]] // CHECK-DAG: load i16, i16* [[REF_AA]] // CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: br label {{%?}}[[TERM:.+]] - // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -420,19 +439,22 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l316}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp 
eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -448,7 +470,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+S1.+l316]]( + // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]]( // Create local storage for each capture. // CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]* // CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] @@ -466,35 +488,39 @@ int bar(int n){ // CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], // CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], // CHECK-DAG: [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]], - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T6]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T5]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // Use captures. 
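 // (Each captured value below is then read back through the reference
 // materialized from its local alloca: the CHECK-64/CHECK-32 pairs encode
 // that 64-bit targets load scalars through the REF_* pointers recovered by
 // bitcast, while 32-bit targets read the LOCAL_* copy directly.)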
// CHECK-DAG: getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0 // CHECK-64-DAG:load i32, i32* [[REF_B]] // CHECK-32-DAG:load i32, i32* [[LOCAL_B]] // CHECK-DAG: getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}} - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -503,19 +529,22 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l281}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -531,7 +560,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T7:@__omp_offloading_.+template.+l281]](i[[SZ]] + // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]] // Create local storage for each capture. 
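 // (Scalar captures arrive as i[[SZ]] parameters and are spilled into these
 // allocas; typed references are then recovered by bitcast, e.g. the i[[SZ]]
 // slot for 'aa' is reinterpreted as i16*.)
 //
 // Taken together, the entry-point checks for these regions all match one
 // shape. As a rough pseudo-CUDA sketch of the matched IR (illustrative
 // names only; the exact master-thread id computation is elided by the
 // CHECK lines, which stop at "icmp eq i32 [[CMTID]],"):
 //
 //   int tid = tid_x(), nth = ntid_x(), ws = warpsize();
 //   if (tid < nth - ws) {              // lower threads become workers
 //     worker();                        // state machine checked above
 //   } else if (is_master(tid)) {       // master thread of the last warp
 //     __kmpc_kernel_init(nth - ws);    // announce the worker count
 //     /* run the target region sequentially, using the captures */
 //     __kmpc_kernel_deinit();
 //     barrier0();                      // synchronize before exit
 //   }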
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -544,36 +573,39 @@ int bar(int n){ // CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* // CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], // - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T7]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T6]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // // CHECK-64-DAG: load i32, i32* [[REF_A]] // CHECK-32-DAG: load i32, i32* [[LOCAL_A]] // CHECK-DAG: load i16, i16* [[REF_AA]] // CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 // - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // diff --git a/test/OpenMP/target_codegen.cpp b/test/OpenMP/target_codegen.cpp index f263ebdd2fe3..b5e4b07cce04 100644 --- a/test/OpenMP/target_codegen.cpp +++ b/test/OpenMP/target_codegen.cpp @@ -22,11 +22,11 @@ // CHECK-DAG: [[TT:%.+]] = type { i64, i8 } // CHECK-DAG: [[S1:%.+]] = type { double } -// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]] } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } // CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } // CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } -// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, 
i{{32|64}} } +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } // We have 8 target regions, but only 7 that actually will generate offloading // code, only 6 will have mapped arguments, and only 4 have all-constant map diff --git a/test/OpenMP/target_codegen_registration.cpp b/test/OpenMP/target_codegen_registration.cpp index a440faff9158..f2721b77fec0 100644 --- a/test/OpenMP/target_codegen_registration.cpp +++ b/test/OpenMP/target_codegen_registration.cpp @@ -30,11 +30,11 @@ // CHECK-DAG: [[SE:%.+]] = type { [64 x i32] } // CHECK-DAG: [[ST1:%.+]] = type { [228 x i32] } // CHECK-DAG: [[ST2:%.+]] = type { [1128 x i32] } -// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]] } +// CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } // CHECK-DAG: [[DEVTY:%.+]] = type { i8*, i8*, [[ENTTY]]*, [[ENTTY]]* } // CHECK-DAG: [[DSCTY:%.+]] = type { i32, [[DEVTY]]*, [[ENTTY]]*, [[ENTTY]]* } -// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]] } +// TCHECK: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 } // CHECK-DAG: [[A1:@.+]] = internal global [[SA]] // CHECK-DAG: [[A2:@.+]] = global [[SA]] @@ -100,54 +100,54 @@ // CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i // CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00" -// CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00" -// CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00" -// CHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00" -// CHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00" -// CHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { 
i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00" -// CHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00" -// CHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00" -// CHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00" -// CHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00" -// CHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00" -// CHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section 
".omp_offloading.entries", align 1 // CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00" -// CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00" -// TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00" -// TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR3:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME3:.+]]\00" -// TCHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY3:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR3]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR4:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME4:.+]]\00" -// TCHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY4:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR4]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR5:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME5:.+]]\00" -// TCHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY5:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR5]], i32 0, i32 0), i[[SZ]] 0, i32 0, 
i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR6:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME6:.+]]\00" -// TCHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY6:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR6]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR7:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME7:.+]]\00" -// TCHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY7:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR7]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR8:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME8:.+]]\00" -// TCHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY8:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR8]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR9:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME9:.+]]\00" -// TCHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY9:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR9]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR10:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME10:.+]]\00" -// TCHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY10:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR10]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR11:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME11:.+]]\00" -// TCHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY11:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* 
[[NAMEPTR11]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // TCHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00" -// TCHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1 +// TCHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 // CHECK: [[ENTBEGIN:@.+]] = external constant [[ENTTY]] // CHECK: [[ENTEND:@.+]] = external constant [[ENTTY]] diff --git a/test/OpenMP/teams_distribute_collapse_messages.cpp b/test/OpenMP/teams_distribute_collapse_messages.cpp index 9ce58e0b0650..37c10e5986bf 100644 --- a/test/OpenMP/teams_distribute_collapse_messages.cpp +++ b/test/OpenMP/teams_distribute_collapse_messages.cpp @@ -66,7 +66,8 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; -#pragma omp distribute collapse (S) // expected-error {{'S' does not refer to a value}} +#pragma omp target +#pragma omp teams distribute collapse (S) // expected-error {{'S' does not refer to a value}} for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; diff --git a/test/Preprocessor/cuda-types.cu b/test/Preprocessor/cuda-types.cu index 2b6160b8d6c7..5f7b91655cdf 100644 --- a/test/Preprocessor/cuda-types.cu +++ b/test/Preprocessor/cuda-types.cu @@ -28,3 +28,19 @@ // RUN: | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \ // RUN: | grep -v '__LDBL\|_LONG_DOUBLE' > %T/powerpc64-device-defines-filtered // RUN: diff %T/powerpc64-host-defines-filtered %T/powerpc64-device-defines-filtered + +// RUN: %clang --cuda-host-only -nocudainc -target i386-windows-msvc -x cuda -E -dM -o - /dev/null \ +// RUN: | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \ +// RUN: | grep -v '__LDBL\|_LONG_DOUBLE' > %T/i386-msvc-host-defines-filtered +// RUN: %clang --cuda-device-only -nocudainc -nocudalib -target i386-windows-msvc -x cuda -E -dM -o - /dev/null \ +// RUN: | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \ +// RUN: | grep -v '__LDBL\|_LONG_DOUBLE' > %T/i386-msvc-device-defines-filtered +// RUN: diff %T/i386-msvc-host-defines-filtered %T/i386-msvc-device-defines-filtered + +// RUN: %clang --cuda-host-only -nocudainc -target x86_64-windows-msvc -x cuda -E -dM -o - /dev/null \ +// RUN: | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \ +// RUN: | grep -v '__LDBL\|_LONG_DOUBLE' > %T/x86_64-msvc-host-defines-filtered +// RUN: %clang --cuda-device-only -nocudainc -nocudalib -target x86_64-windows-msvc -x cuda -E -dM -o - /dev/null \ +// RUN: | grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)\|define __GCC_ATOMIC' \ +// RUN: | grep -v '__LDBL\|_LONG_DOUBLE' > %T/x86_64-msvc-device-defines-filtered +// RUN: diff %T/x86_64-msvc-host-defines-filtered %T/x86_64-msvc-device-defines-filtered diff --git a/test/Preprocessor/init.c b/test/Preprocessor/init.c index b003404df6ff..8b8901931e7a 100644 --- a/test/Preprocessor/init.c +++ b/test/Preprocessor/init.c @@ -9189,3 +9189,174 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple x86_64-windows-cygnus < /dev/null | FileCheck -match-full-lines 
-check-prefix CYGWIN-X64 %s // CYGWIN-X64: #define __USER_LABEL_PREFIX__ +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=avr \ +// RUN: < /dev/null \ +// RUN: | FileCheck -match-full-lines -check-prefix=AVR %s +// +// AVR:#define __ATOMIC_ACQUIRE 2 +// AVR:#define __ATOMIC_ACQ_REL 4 +// AVR:#define __ATOMIC_CONSUME 1 +// AVR:#define __ATOMIC_RELAXED 0 +// AVR:#define __ATOMIC_RELEASE 3 +// AVR:#define __ATOMIC_SEQ_CST 5 +// AVR:#define __AVR__ 1 +// AVR:#define __BIGGEST_ALIGNMENT__ 1 +// AVR:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +// AVR:#define __CHAR16_TYPE__ unsigned int +// AVR:#define __CHAR32_TYPE__ long unsigned int +// AVR:#define __CHAR_BIT__ 8 +// AVR:#define __DBL_DECIMAL_DIG__ 9 +// AVR:#define __DBL_DENORM_MIN__ 1.40129846e-45 +// AVR:#define __DBL_DIG__ 6 +// AVR:#define __DBL_EPSILON__ 1.19209290e-7 +// AVR:#define __DBL_HAS_DENORM__ 1 +// AVR:#define __DBL_HAS_INFINITY__ 1 +// AVR:#define __DBL_HAS_QUIET_NAN__ 1 +// AVR:#define __DBL_MANT_DIG__ 24 +// AVR:#define __DBL_MAX_10_EXP__ 38 +// AVR:#define __DBL_MAX_EXP__ 128 +// AVR:#define __DBL_MAX__ 3.40282347e+38 +// AVR:#define __DBL_MIN_10_EXP__ (-37) +// AVR:#define __DBL_MIN_EXP__ (-125) +// AVR:#define __DBL_MIN__ 1.17549435e-38 +// AVR:#define __FINITE_MATH_ONLY__ 0 +// AVR:#define __FLT_DECIMAL_DIG__ 9 +// AVR:#define __FLT_DENORM_MIN__ 1.40129846e-45F +// AVR:#define __FLT_DIG__ 6 +// AVR:#define __FLT_EPSILON__ 1.19209290e-7F +// AVR:#define __FLT_EVAL_METHOD__ 0 +// AVR:#define __FLT_HAS_DENORM__ 1 +// AVR:#define __FLT_HAS_INFINITY__ 1 +// AVR:#define __FLT_HAS_QUIET_NAN__ 1 +// AVR:#define __FLT_MANT_DIG__ 24 +// AVR:#define __FLT_MAX_10_EXP__ 38 +// AVR:#define __FLT_MAX_EXP__ 128 +// AVR:#define __FLT_MAX__ 3.40282347e+38F +// AVR:#define __FLT_MIN_10_EXP__ (-37) +// AVR:#define __FLT_MIN_EXP__ (-125) +// AVR:#define __FLT_MIN__ 1.17549435e-38F +// AVR:#define __FLT_RADIX__ 2 +// AVR:#define __GCC_ATOMIC_BOOL_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_CHAR_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_INT_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_LONG_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_POINTER_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_SHORT_LOCK_FREE 1 +// AVR:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 +// AVR:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 1 +// AVR:#define __GXX_ABI_VERSION 1002 +// AVR:#define __INT16_C_SUFFIX__ +// AVR:#define __INT16_MAX__ 32767 +// AVR:#define __INT16_TYPE__ short +// AVR:#define __INT32_C_SUFFIX__ L +// AVR:#define __INT32_MAX__ 2147483647L +// AVR:#define __INT32_TYPE__ long int +// AVR:#define __INT64_C_SUFFIX__ LL +// AVR:#define __INT64_MAX__ 9223372036854775807LL +// AVR:#define __INT64_TYPE__ long long int +// AVR:#define __INT8_C_SUFFIX__ +// AVR:#define __INT8_MAX__ 127 +// AVR:#define __INT8_TYPE__ signed char +// AVR:#define __INTMAX_C_SUFFIX__ LL +// AVR:#define __INTMAX_MAX__ 9223372036854775807LL +// AVR:#define __INTMAX_TYPE__ long long int +// AVR:#define __INTPTR_MAX__ 32767 +// AVR:#define __INTPTR_TYPE__ int +// AVR:#define __INT_FAST16_MAX__ 32767 +// AVR:#define __INT_FAST16_TYPE__ int +// AVR:#define __INT_FAST32_MAX__ 2147483647L +// AVR:#define __INT_FAST32_TYPE__ long int +// AVR:#define __INT_FAST64_MAX__ 9223372036854775807LL +// AVR:#define __INT_FAST64_TYPE__ long long int +// AVR:#define __INT_FAST8_MAX__ 127 +// AVR:#define __INT_FAST8_TYPE__ signed char +// AVR:#define __INT_LEAST16_MAX__ 32767 
+// AVR:#define __INT_LEAST16_TYPE__ int +// AVR:#define __INT_LEAST32_MAX__ 2147483647L +// AVR:#define __INT_LEAST32_TYPE__ long int +// AVR:#define __INT_LEAST64_MAX__ 9223372036854775807LL +// AVR:#define __INT_LEAST64_TYPE__ long long int +// AVR:#define __INT_LEAST8_MAX__ 127 +// AVR:#define __INT_LEAST8_TYPE__ signed char +// AVR:#define __INT_MAX__ 32767 +// AVR:#define __LDBL_DECIMAL_DIG__ 9 +// AVR:#define __LDBL_DENORM_MIN__ 1.40129846e-45L +// AVR:#define __LDBL_DIG__ 6 +// AVR:#define __LDBL_EPSILON__ 1.19209290e-7L +// AVR:#define __LDBL_HAS_DENORM__ 1 +// AVR:#define __LDBL_HAS_INFINITY__ 1 +// AVR:#define __LDBL_HAS_QUIET_NAN__ 1 +// AVR:#define __LDBL_MANT_DIG__ 24 +// AVR:#define __LDBL_MAX_10_EXP__ 38 +// AVR:#define __LDBL_MAX_EXP__ 128 +// AVR:#define __LDBL_MAX__ 3.40282347e+38L +// AVR:#define __LDBL_MIN_10_EXP__ (-37) +// AVR:#define __LDBL_MIN_EXP__ (-125) +// AVR:#define __LDBL_MIN__ 1.17549435e-38L +// AVR:#define __LONG_LONG_MAX__ 9223372036854775807LL +// AVR:#define __LONG_MAX__ 2147483647L +// AVR:#define __NO_INLINE__ 1 +// AVR:#define __ORDER_BIG_ENDIAN__ 4321 +// AVR:#define __ORDER_LITTLE_ENDIAN__ 1234 +// AVR:#define __ORDER_PDP_ENDIAN__ 3412 +// AVR:#define __PRAGMA_REDEFINE_EXTNAME 1 +// AVR:#define __PTRDIFF_MAX__ 32767 +// AVR:#define __PTRDIFF_TYPE__ int +// AVR:#define __SCHAR_MAX__ 127 +// AVR:#define __SHRT_MAX__ 32767 +// AVR:#define __SIG_ATOMIC_MAX__ 127 +// AVR:#define __SIG_ATOMIC_WIDTH__ 8 +// AVR:#define __SIZEOF_DOUBLE__ 4 +// AVR:#define __SIZEOF_FLOAT__ 4 +// AVR:#define __SIZEOF_INT__ 2 +// AVR:#define __SIZEOF_LONG_DOUBLE__ 4 +// AVR:#define __SIZEOF_LONG_LONG__ 8 +// AVR:#define __SIZEOF_LONG__ 4 +// AVR:#define __SIZEOF_POINTER__ 2 +// AVR:#define __SIZEOF_PTRDIFF_T__ 2 +// AVR:#define __SIZEOF_SHORT__ 2 +// AVR:#define __SIZEOF_SIZE_T__ 2 +// AVR:#define __SIZEOF_WCHAR_T__ 2 +// AVR:#define __SIZEOF_WINT_T__ 2 +// AVR:#define __SIZE_MAX__ 65535U +// AVR:#define __SIZE_TYPE__ unsigned int +// AVR:#define __STDC__ 1 +// AVR:#define __UINT16_MAX__ 65535U +// AVR:#define __UINT16_TYPE__ unsigned short +// AVR:#define __UINT32_C_SUFFIX__ UL +// AVR:#define __UINT32_MAX__ 4294967295UL +// AVR:#define __UINT32_TYPE__ long unsigned int +// AVR:#define __UINT64_C_SUFFIX__ ULL +// AVR:#define __UINT64_MAX__ 18446744073709551615ULL +// AVR:#define __UINT64_TYPE__ long long unsigned int +// AVR:#define __UINT8_C_SUFFIX__ +// AVR:#define __UINT8_MAX__ 255 +// AVR:#define __UINT8_TYPE__ unsigned char +// AVR:#define __UINTMAX_C_SUFFIX__ ULL +// AVR:#define __UINTMAX_MAX__ 18446744073709551615ULL +// AVR:#define __UINTMAX_TYPE__ long long unsigned int +// AVR:#define __UINTPTR_MAX__ 65535U +// AVR:#define __UINTPTR_TYPE__ unsigned int +// AVR:#define __UINT_FAST16_MAX__ 65535U +// AVR:#define __UINT_FAST16_TYPE__ unsigned int +// AVR:#define __UINT_FAST32_MAX__ 4294967295UL +// AVR:#define __UINT_FAST32_TYPE__ long unsigned int +// AVR:#define __UINT_FAST64_MAX__ 18446744073709551615ULL +// AVR:#define __UINT_FAST64_TYPE__ long long unsigned int +// AVR:#define __UINT_FAST8_MAX__ 255 +// AVR:#define __UINT_FAST8_TYPE__ unsigned char +// AVR:#define __UINT_LEAST16_MAX__ 65535U +// AVR:#define __UINT_LEAST16_TYPE__ unsigned int +// AVR:#define __UINT_LEAST32_MAX__ 4294967295UL +// AVR:#define __UINT_LEAST32_TYPE__ long unsigned int +// AVR:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL +// AVR:#define __UINT_LEAST64_TYPE__ long long unsigned int +// AVR:#define __UINT_LEAST8_MAX__ 255 +// AVR:#define __UINT_LEAST8_TYPE__ unsigned char +// 
AVR:#define __USER_LABEL_PREFIX__ +// AVR:#define __WCHAR_MAX__ 32767 +// AVR:#define __WCHAR_TYPE__ int +// AVR:#define __WINT_TYPE__ int diff --git a/test/Sema/warn-cast-align.c b/test/Sema/warn-cast-align.c index e8f85bc14d8d..389c0c17d2f7 100644 --- a/test/Sema/warn-cast-align.c +++ b/test/Sema/warn-cast-align.c @@ -59,3 +59,11 @@ void test4() { i = (int *)&s.s0; i = (int *)a; } + +// No warnings. +typedef int (*FnTy)(void); +unsigned int func5(void); + +FnTy test5(void) { + return (FnTy)&func5; +} diff --git a/test/Sema/warn-strict-prototypes.m b/test/Sema/warn-strict-prototypes.m index cbb01a1f7b21..4567dab01930 100644 --- a/test/Sema/warn-strict-prototypes.m +++ b/test/Sema/warn-strict-prototypes.m @@ -14,7 +14,8 @@ void foo() { void (^block)() = // expected-warning {{this function declaration is not a prototype}} ^void(int arg) { // no warning }; - void (^block2)(void) = // no warning - ^void() { // expected-warning {{this function declaration is not a prototype}} + void (^block2)(void) = ^void() { // no warning + }; + void (^block3)(void) = ^ { // no warning }; } diff --git a/test/Sema/warn-thread-safety-analysis.c b/test/Sema/warn-thread-safety-analysis.c index a0c4026b9136..425ce4c196a6 100644 --- a/test/Sema/warn-thread-safety-analysis.c +++ b/test/Sema/warn-thread-safety-analysis.c @@ -127,3 +127,7 @@ int main() { return 0; } + +// We had a problem where we'd skip all attributes that follow a late-parsed +// attribute in a single __attribute__. +void run() __attribute__((guarded_by(mu1), guarded_by(mu1))); // expected-warning 2{{only applies to fields and global variables}} diff --git a/test/SemaCUDA/attr-declspec.cu b/test/SemaCUDA/attr-declspec.cu new file mode 100644 index 000000000000..dda12ce8a51f --- /dev/null +++ b/test/SemaCUDA/attr-declspec.cu @@ -0,0 +1,34 @@ +// Test the __declspec spellings of CUDA attributes. +// +// RUN: %clang_cc1 -fsyntax-only -fms-extensions -verify %s +// RUN: %clang_cc1 -fsyntax-only -fms-extensions -fcuda-is-device -verify %s +// Now pretend that we're compiling a C file. There should be warnings. +// RUN: %clang_cc1 -DEXPECT_WARNINGS -fms-extensions -fsyntax-only -verify -x c %s + +#if defined(EXPECT_WARNINGS) +// expected-warning@+12 {{'__device__' attribute ignored}} +// expected-warning@+12 {{'__global__' attribute ignored}} +// expected-warning@+12 {{'__constant__' attribute ignored}} +// expected-warning@+12 {{'__shared__' attribute ignored}} +// expected-warning@+12 {{'__host__' attribute ignored}} +// +// (Currently we don't for the other attributes. They are implemented with +// IgnoredAttr, which is ignored irrespective of any LangOpts.) +#else +// expected-no-diagnostics +#endif + +__declspec(__device__) void f_device(); +__declspec(__global__) void f_global(); +__declspec(__constant__) int* g_constant; +__declspec(__shared__) float *g_shared; +__declspec(__host__) void f_host(); +__declspec(__device_builtin__) void f_device_builtin(); +typedef __declspec(__device_builtin__) const void *t_device_builtin; +enum __declspec(__device_builtin__) e_device_builtin {E}; +__declspec(__device_builtin__) int v_device_builtin; +__declspec(__cudart_builtin__) void f_cudart_builtin(); +__declspec(__device_builtin_surface_type__) unsigned long long surface_var; +__declspec(__device_builtin_texture_type__) unsigned long long texture_var; + +// Note that there's no __declspec spelling of nv_weak. 
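// For reference: with -fms-extensions each __declspec spelling above is
// accepted as an alternative spelling of the corresponding GNU-style CUDA
// attribute, so the following pair should behave identically (an
// illustrative equivalence, not part of this test):
//
//   __declspec(__device__)  void f_ms();
//   __attribute__((device)) void f_gnu();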
diff --git a/test/SemaCUDA/cuda-inherits-calling-conv.cu b/test/SemaCUDA/cuda-inherits-calling-conv.cu new file mode 100644 index 000000000000..67c438fa621b --- /dev/null +++ b/test/SemaCUDA/cuda-inherits-calling-conv.cu @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=c++11 -triple i386-windows-msvc \ +// RUN: -aux-triple nvptx-nvidia-cuda -fsyntax-only -verify %s + +// RUN: %clang_cc1 -std=c++11 -triple nvptx-nvidia-cuda \ +// RUN: -aux-triple i386-windows-msvc -fsyntax-only \ +// RUN: -fcuda-is-device -verify %s + +// RUN: %clang_cc1 -std=c++11 -triple nvptx-nvidia-cuda \ +// RUN: -aux-triple x86_64-linux-gnu -fsyntax-only \ +// RUN: -fcuda-is-device -verify -verify-ignore-unexpected=note \ +// RUN: -DEXPECT_ERR %s + +// CUDA device code should inherit the host's calling conventions. + +template +struct Foo; + +template +struct Foo {}; + +// On x86_64-linux-gnu, this is a redefinition of the template, because the +// __fastcall calling convention doesn't exist (and is therefore ignored). +#ifndef EXPECT_ERR +// expected-no-diagnostics +#else +// expected-error@+4 {{redefinition of 'Foo}} +// expected-warning@+3 {{calling convention '__fastcall' ignored}} +#endif +template +struct Foo {}; diff --git a/test/SemaCXX/constant-expression-cxx11.cpp b/test/SemaCXX/constant-expression-cxx11.cpp index 581a524339e7..884f2f30c42f 100644 --- a/test/SemaCXX/constant-expression-cxx11.cpp +++ b/test/SemaCXX/constant-expression-cxx11.cpp @@ -1725,7 +1725,7 @@ namespace AfterError { constexpr int error() { // expected-error {{no return statement}} return foobar; // expected-error {{undeclared identifier}} } - constexpr int k = error(); // expected-error {{must be initialized by a constant expression}} + constexpr int k = error(); } namespace std { @@ -2030,7 +2030,7 @@ namespace PR21786 { namespace PR21859 { constexpr int Fun() { return; } // expected-error {{non-void constexpr function 'Fun' should return a value}} - constexpr int Var = Fun(); // expected-error {{constexpr variable 'Var' must be initialized by a constant expression}} + constexpr int Var = Fun(); } struct InvalidRedef { diff --git a/test/SemaCXX/conversion-function.cpp b/test/SemaCXX/conversion-function.cpp index c725a0d5b7c1..531de818b680 100644 --- a/test/SemaCXX/conversion-function.cpp +++ b/test/SemaCXX/conversion-function.cpp @@ -440,7 +440,7 @@ namespace PR18234 { #endif } a; A::S s = a; // expected-error {{no viable conversion from 'struct A' to 'A::S'}} - A::E e = a; // expected-note {{here}} + A::E e = a; bool k1 = e == A::e; // expected-error {{no member named 'e'}} bool k2 = e.n == 0; } diff --git a/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp b/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp index 75c6734bce30..9b8fadd2f522 100644 --- a/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp +++ b/test/SemaCXX/cxx0x-initializer-stdinitializerlist.cpp @@ -105,6 +105,7 @@ T deduce_ref(const std::initializer_list&); // expected-note {{conflicting ty template struct pair { pair(...); }; template void deduce_pairs(std::initializer_list>); +// expected-note@-1 {{deduced type 'pair<[...], typename WithIntType::type>' of element of 1st parameter does not match adjusted type 'pair<[...], float>' of element of argument [with T = WithIntType]}} struct WithIntType { typedef int type; }; template void deduce_after_init_list_in_pack(void (*)(T...), T...); // expected-note {{ vs. 
<(no value), double>}} @@ -123,7 +124,7 @@ void argument_deduction() { pair pi; pair pf; deduce_pairs({pi, pi, pi}); // ok - deduce_pairs({pi, pf, pi}); // FIXME: This should be rejected, as we fail to produce a type that exactly matches the argument type. + deduce_pairs({pi, pf, pi}); // expected-error {{no matching function}} deduce_after_init_list_in_pack((void(*)(int,int))0, {}, 0); deduce_after_init_list_in_pack((void(*)(int,int))0, {}, 0.0); // expected-error {{no matching function}} @@ -298,9 +299,18 @@ namespace TemporaryInitListSourceRange_PR22367 { namespace ParameterPackNestedInitializerLists_PR23904c3 { template - void f(std::initializer_list> ...tt); + void f(std::initializer_list> ...tt); // expected-note 2{{conflicting}} expected-note {{incomplete pack}} - void foo() { f({{0}}, {{'\0'}}); } + void foo() { + f({{0}}, {{'\0'}}); // ok, T = + f({{0}, {'\0'}}); // expected-error {{no match}} + f({{0, '\0'}}); // expected-error {{no match}} + + f({{0}}, {{{}}}); // expected-error {{no match}} + f({{0}}, {{{}, '\0'}}); // ok, T = + f({{0}, {{}}}); // ok, T = + f({{0, {}}}); // ok, T = + } } namespace update_rbrace_loc_crash { @@ -327,3 +337,13 @@ namespace update_rbrace_loc_crash { Explode([](int) {}); } } + +namespace no_conversion_after_auto_list_deduction { + // We used to deduce 'auto' == 'std::initializer_list' here, and then + // incorrectly accept the declaration of 'x'. + struct X { using T = std::initializer_list X::*; operator T(); }; + auto X::*x = { X() }; // expected-error {{from initializer list}} + + struct Y { using T = std::initializer_list(*)(); operator T(); }; + auto (*y)() = { Y() }; // expected-error {{from initializer list}} +} diff --git a/test/SemaCXX/cxx1z-decomposition.cpp b/test/SemaCXX/cxx1z-decomposition.cpp index 735a9e1dfee0..d457ace5d844 100644 --- a/test/SemaCXX/cxx1z-decomposition.cpp +++ b/test/SemaCXX/cxx1z-decomposition.cpp @@ -65,4 +65,9 @@ void for_range() { } } +int error_recovery() { + auto [foobar]; // expected-error {{requires an initializer}} + return foobar_; // expected-error {{undeclared identifier 'foobar_'}} +} + // FIXME: by-value array copies diff --git a/test/SemaCXX/default-arg-closures.cpp b/test/SemaCXX/default-arg-closures.cpp index e076cc05cd20..676bd486105f 100644 --- a/test/SemaCXX/default-arg-closures.cpp +++ b/test/SemaCXX/default-arg-closures.cpp @@ -4,16 +4,15 @@ // instantiating and checking the semantics of default arguments. Make sure we // do that right. -// FIXME: Don't diagnose this issue twice. 
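// (With this change the dependent default argument is instantiated, and
// hence diagnosed, only once: at the point where the dllexported
// default-ctor closure requires it. The expected-error count drops from 2
// to 1 and the instantiation notes move onto ExportDefaultCtorClosure.)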
template -struct DependentDefaultCtorArg { // expected-note {{in instantiation of default function argument}} - // expected-error@+1 2 {{type 'int' cannot be used prior to '::' because it has no members}} +struct DependentDefaultCtorArg { + // expected-error@+1 {{type 'int' cannot be used prior to '::' because it has no members}} DependentDefaultCtorArg(int n = T::error); }; struct __declspec(dllexport) // expected-note {{due to 'ExportDefaultCtorClosure' being dllexported}} -ExportDefaultCtorClosure // expected-note {{implicit default constructor for 'ExportDefaultCtorClosure' first required here}} -: DependentDefaultCtorArg // expected-note {{in instantiation of template class}} +ExportDefaultCtorClosure // expected-note {{in instantiation of default function argument expression for 'DependentDefaultCtorArg' required here}} expected-note {{implicit default constructor for 'ExportDefaultCtorClosure' first required here}} +: DependentDefaultCtorArg {}; template diff --git a/test/SemaCXX/dllexport.cpp b/test/SemaCXX/dllexport.cpp index b4850fc03d9b..a3fed70ec958 100644 --- a/test/SemaCXX/dllexport.cpp +++ b/test/SemaCXX/dllexport.cpp @@ -741,6 +741,27 @@ struct __declspec(dllexport) ClassWithMultipleDefaultCtors { ClassWithMultipleDefaultCtors(int = 40) {} // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}} ClassWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}} }; +template +struct ClassTemplateWithMultipleDefaultCtors { + __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {} // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}} + __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}} +}; + +template struct HasDefaults { + HasDefaults(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}} +}; +template struct __declspec(dllexport) HasDefaults; + +template struct +__declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults' required here}} +HasDefaults; // expected-note {{in instantiation of member function 'HasDefaults::HasDefaults' requested here}} + +template struct HasDefaults2 { + __declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults2' required here}} + HasDefaults2(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}} +}; +template struct HasDefaults2; // expected-note {{in instantiation of member function 'HasDefaults2::HasDefaults2' requested here}} + #endif //===----------------------------------------------------------------------===// diff --git a/test/SemaCXX/type-definition-in-specifier.cpp b/test/SemaCXX/type-definition-in-specifier.cpp index 74ba058b4f12..2da649fdb0b8 100644 --- a/test/SemaCXX/type-definition-in-specifier.cpp +++ b/test/SemaCXX/type-definition-in-specifier.cpp @@ -59,10 +59,8 @@ struct s19018b { }; struct pr18963 { - short bar5 (struct foo4 {} bar2); // expected-error{{'foo4' cannot be defined in a parameter type}} \ - // expected-note{{declared here}} - - long foo5 (float foo6 = foo4); // expected-error{{'foo4' does not refer to a value}} + short bar5 (struct foo4 {} bar2); // expected-error{{'foo4' cannot be defined in a parameter type}} + long foo5 (float foo6 = foo4); }; // expected-error@+2 {{cannot be defined in a parameter type}} diff --git a/test/SemaObjC/block-omitted-return-type.m 
b/test/SemaObjC/block-omitted-return-type.m index 20e32e01865e..93d5e05ea282 100644 --- a/test/SemaObjC/block-omitted-return-type.m +++ b/test/SemaObjC/block-omitted-return-type.m @@ -24,7 +24,7 @@ return; }; void (^simpleBlock5)() = ^ const void { //expected-error {{incompatible block pointer types initializing 'void (^)()' with an expression of type 'const void (^)(void)'}} - return; + return; // expected-warning@-1 {{function cannot return qualified void type 'const void'}} }; void (^simpleBlock6)() = ^ const (void) { //expected-warning {{'const' qualifier on omitted return type '' has no effect}} return; diff --git a/test/SemaOpenCL/extensions.cl b/test/SemaOpenCL/extensions.cl index c27f3397cd79..6afb11e42a6a 100644 --- a/test/SemaOpenCL/extensions.cl +++ b/test/SemaOpenCL/extensions.cl @@ -22,6 +22,17 @@ // RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only -cl-ext=-all -cl-ext=+cl_khr_fp64 -cl-ext=+cl_khr_fp16 -cl-ext=-cl_khr_fp64 -DNOFP64 // RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only -cl-ext=-all -cl-ext=+cl_khr_fp64,-cl_khr_fp64,+cl_khr_fp16 -DNOFP64 +// Test with -finclude-default-header, which includes opencl-c.h. opencl-c.h +// disables all extensions by default, but supported core extensions for a +// particular OpenCL version must be re-enabled (for example, cl_khr_fp64 is +// enabled by default with -cl-std=CL2.0). +// +// RUN: %clang_cc1 %s -triple amdgcn-unknown-unknown -verify -pedantic -fsyntax-only -cl-std=CL2.0 -finclude-default-header + +#ifdef _OPENCL_H_ +// expected-no-diagnostics +#endif + #ifdef FP64 // expected-no-diagnostics #endif @@ -33,6 +44,7 @@ void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 exte } #endif +#ifndef _OPENCL_H_ int isnan(float x) { return __builtin_isnan(x); } @@ -40,6 +52,7 @@ int isnan(float x) { int isfinite(float x) { return __builtin_isfinite(x); } +#endif #pragma OPENCL EXTENSION cl_khr_fp64 : enable #ifdef NOFP64 diff --git a/test/SemaTemplate/deduction.cpp b/test/SemaTemplate/deduction.cpp index 5695cab9a27e..2275a8b3b7ad 100644 --- a/test/SemaTemplate/deduction.cpp +++ b/test/SemaTemplate/deduction.cpp @@ -407,3 +407,38 @@ namespace overload_vs_pack { void test() { j(x, f, x); } } } + +namespace b29946541 { + template class A {}; + template class C> + void f(C); // expected-note {{failed template argument deduction}} + void g(A a) { f(a); } // expected-error {{no match}} +} + +namespace deduction_from_empty_list { + template void f(int (&&)[N], int (&&)[N]) { // expected-note {{1 vs. 
2}} + static_assert(M == N, ""); + } + + void test() { + f<5>({}, {}); + f<1>({}, {0}); + f<1>({0}, {}); + f<1>({0}, {0}); + f<1>({0}, {0, 1}); // expected-error {{no matching}} + } +} + +namespace check_extended_pack { + template struct X { typedef int type; }; + template void f(typename X::type...); + template void f(T, int, int); + void g() { + f(0, 0, 0); + } + + template struct Y {}; + template void g(Y); // expected-note {{deduced non-type template argument does not have the same type as the corresponding template parameter ('int *' vs 'int')}} + int n; + void h() { g<0>(Y<0, &n>()); } // expected-error {{no matching function}} +} diff --git a/test/SemaTemplate/instantiate-local-class.cpp b/test/SemaTemplate/instantiate-local-class.cpp index a61af7a5af38..eaff4c4bbc8d 100644 --- a/test/SemaTemplate/instantiate-local-class.cpp +++ b/test/SemaTemplate/instantiate-local-class.cpp @@ -475,3 +475,14 @@ namespace rdar23721638 { } template void bar(); // expected-note {{in instantiation}} } + +namespace anon_union_default_member_init { + template void f() { + struct S { + union { + int i = 0; + }; + }; + } + void g() { f(); } +} diff --git a/tools/c-index-test/core_main.cpp b/tools/c-index-test/core_main.cpp index 3e4052c93ef5..8976d9134916 100644 --- a/tools/c-index-test/core_main.cpp +++ b/tools/c-index-test/core_main.cpp @@ -140,8 +140,7 @@ static bool printSourceSymbols(ArrayRef Args) { ArgsWithProgName.append(Args.begin(), Args.end()); IntrusiveRefCntPtr Diags(CompilerInstance::createDiagnostics(new DiagnosticOptions)); - IntrusiveRefCntPtr - CInvok(createInvocationFromCommandLine(ArgsWithProgName, Diags)); + auto CInvok = createInvocationFromCommandLine(ArgsWithProgName, Diags); if (!CInvok) return true; @@ -153,7 +152,7 @@ static bool printSourceSymbols(ArrayRef Args) { auto PCHContainerOps = std::make_shared(); std::unique_ptr Unit(ASTUnit::LoadFromCompilerInvocationAction( - CInvok.get(), PCHContainerOps, Diags, IndexAction.get())); + std::move(CInvok), PCHContainerOps, Diags, IndexAction.get())); if (!Unit) return true; diff --git a/tools/clang-import-test/clang-import-test.cpp b/tools/clang-import-test/clang-import-test.cpp index 47598fc91813..33190af4bf45 100644 --- a/tools/clang-import-test/clang-import-test.cpp +++ b/tools/clang-import-test/clang-import-test.cpp @@ -157,7 +157,7 @@ BuildCompilerInstance(ArrayRef ClangArgv) { Inv->getCodeGenOpts().setDebugInfo(codegenoptions::FullDebugInfo); Inv->getTargetOpts().Triple = llvm::sys::getDefaultTargetTriple(); - Ins->setInvocation(Inv.release()); + Ins->setInvocation(std::move(Inv)); TargetInfo *TI = TargetInfo::CreateTargetInfo( Ins->getDiagnostics(), Ins->getInvocation().TargetOpts); diff --git a/tools/diagtool/ShowEnabledWarnings.cpp b/tools/diagtool/ShowEnabledWarnings.cpp index abbd3afbd58c..e6ea786a9ade 100644 --- a/tools/diagtool/ShowEnabledWarnings.cpp +++ b/tools/diagtool/ShowEnabledWarnings.cpp @@ -67,8 +67,8 @@ createDiagnostics(unsigned int argc, char **argv) { SmallVector Args; Args.push_back("diagtool"); Args.append(argv, argv + argc); - std::unique_ptr Invocation( - createInvocationFromCommandLine(Args, InterimDiags)); + std::unique_ptr Invocation = + createInvocationFromCommandLine(Args, InterimDiags); if (!Invocation) return nullptr; diff --git a/tools/libclang/CIndex.cpp b/tools/libclang/CIndex.cpp index 40eea39f3bdb..9cdb2ee8d697 100644 --- a/tools/libclang/CIndex.cpp +++ b/tools/libclang/CIndex.cpp @@ -68,13 +68,14 @@ using namespace clang::cxcursor; using namespace clang::cxtu; using namespace clang::cxindex; 
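// MakeCXTranslationUnit now takes its ASTUnit by std::unique_ptr and
// releases it into the C-style CXTranslationUnitImpl, making the ownership
// hand-off explicit at the libclang boundary. Call sites follow this
// pattern (a sketch mirroring the hunks below):
//
//   std::unique_ptr<ASTUnit> AU = /* ...parse... */;
//   *out_TU = MakeCXTranslationUnit(CXXIdx, std::move(AU));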
-CXTranslationUnit cxtu::MakeCXTranslationUnit(CIndexer *CIdx, ASTUnit *AU) { +CXTranslationUnit cxtu::MakeCXTranslationUnit(CIndexer *CIdx, + std::unique_ptr AU) { if (!AU) return nullptr; assert(CIdx); CXTranslationUnit D = new CXTranslationUnitImpl(); D->CIdx = CIdx; - D->TheASTUnit = AU; + D->TheASTUnit = AU.release(); D->StringPool = new cxstring::CXStringPool(); D->Diagnostics = nullptr; D->OverridenCursorsPool = createOverridenCXCursorsPool(); @@ -3231,7 +3232,7 @@ enum CXErrorCode clang_createTranslationUnit2(CXIndex CIdx, /*CaptureDiagnostics=*/true, /*AllowPCHWithCompilerErrors=*/true, /*UserFilesAreVolatile=*/true); - *out_TU = MakeCXTranslationUnit(CXXIdx, AU.release()); + *out_TU = MakeCXTranslationUnit(CXXIdx, std::move(AU)); return *out_TU ? CXError_Success : CXError_Failure; } @@ -3383,7 +3384,7 @@ clang_parseTranslationUnit_Impl(CXIndex CIdx, const char *source_filename, if (isASTReadError(Unit ? Unit.get() : ErrUnit.get())) return CXError_ASTReadError; - *out_TU = MakeCXTranslationUnit(CXXIdx, Unit.release()); + *out_TU = MakeCXTranslationUnit(CXXIdx, std::move(Unit)); return *out_TU ? CXError_Success : CXError_Failure; } diff --git a/tools/libclang/CIndexCodeCompletion.cpp b/tools/libclang/CIndexCodeCompletion.cpp index 12895c4a9b7a..ca68bc1cd28e 100644 --- a/tools/libclang/CIndexCodeCompletion.cpp +++ b/tools/libclang/CIndexCodeCompletion.cpp @@ -279,13 +279,12 @@ struct AllocatedCXCodeCompleteResults : public CXCodeCompleteResults { SmallVector TemporaryBuffers; /// \brief Allocator used to store globally cached code-completion results. - IntrusiveRefCntPtr - CachedCompletionAllocator; - + std::shared_ptr + CachedCompletionAllocator; + /// \brief Allocator used to store code completion results. - IntrusiveRefCntPtr - CodeCompletionAllocator; - + std::shared_ptr CodeCompletionAllocator; + /// \brief Context under which completion occurred. enum clang::CodeCompletionContext::Kind ContextKind; @@ -315,15 +314,15 @@ struct AllocatedCXCodeCompleteResults : public CXCodeCompleteResults { /// /// Used for debugging purposes only. 
static std::atomic CodeCompletionResultObjects; - + AllocatedCXCodeCompleteResults::AllocatedCXCodeCompleteResults( IntrusiveRefCntPtr FileMgr) - : CXCodeCompleteResults(), - DiagOpts(new DiagnosticOptions), + : CXCodeCompleteResults(), DiagOpts(new DiagnosticOptions), Diag(new DiagnosticsEngine( IntrusiveRefCntPtr(new DiagnosticIDs), &*DiagOpts)), FileMgr(FileMgr), SourceMgr(new SourceManager(*Diag, *FileMgr)), - CodeCompletionAllocator(new clang::GlobalCodeCompletionAllocator), + CodeCompletionAllocator( + std::make_shared()), Contexts(CXCompletionContext_Unknown), ContainerKind(CXCursor_InvalidCode), ContainerIsIncomplete(1) { if (getenv("LIBCLANG_OBJTRACKING")) diff --git a/tools/libclang/CXIndexDataConsumer.cpp b/tools/libclang/CXIndexDataConsumer.cpp index 45198dd1b168..1981cabbbe4c 100644 --- a/tools/libclang/CXIndexDataConsumer.cpp +++ b/tools/libclang/CXIndexDataConsumer.cpp @@ -410,8 +410,8 @@ void CXIndexDataConsumer::setASTContext(ASTContext &ctx) { cxtu::getASTUnit(CXTU)->setASTContext(&ctx); } -void CXIndexDataConsumer::setPreprocessor(Preprocessor &PP) { - cxtu::getASTUnit(CXTU)->setPreprocessor(&PP); +void CXIndexDataConsumer::setPreprocessor(std::shared_ptr PP) { + cxtu::getASTUnit(CXTU)->setPreprocessor(std::move(PP)); } bool CXIndexDataConsumer::isFunctionLocalDecl(const Decl *D) { diff --git a/tools/libclang/CXIndexDataConsumer.h b/tools/libclang/CXIndexDataConsumer.h index 406831f1ddce..718a2a18b1b3 100644 --- a/tools/libclang/CXIndexDataConsumer.h +++ b/tools/libclang/CXIndexDataConsumer.h @@ -342,7 +342,7 @@ public: CXTranslationUnit getCXTU() const { return CXTU; } void setASTContext(ASTContext &ctx); - void setPreprocessor(Preprocessor &PP); + void setPreprocessor(std::shared_ptr PP); bool shouldSuppressRefs() const { return IndexOptions & CXIndexOpt_SuppressRedundantRefs; diff --git a/tools/libclang/CXTranslationUnit.h b/tools/libclang/CXTranslationUnit.h index 6022c9dab1b5..67c31d2dba4f 100644 --- a/tools/libclang/CXTranslationUnit.h +++ b/tools/libclang/CXTranslationUnit.h @@ -38,7 +38,8 @@ struct CXTranslationUnitImpl { namespace clang { namespace cxtu { -CXTranslationUnitImpl *MakeCXTranslationUnit(CIndexer *CIdx, ASTUnit *AU); +CXTranslationUnitImpl *MakeCXTranslationUnit(CIndexer *CIdx, + std::unique_ptr AU); static inline ASTUnit *getASTUnit(CXTranslationUnit TU) { if (!TU) diff --git a/tools/libclang/Indexing.cpp b/tools/libclang/Indexing.cpp index c18b5402aa71..f98b25887973 100644 --- a/tools/libclang/Indexing.cpp +++ b/tools/libclang/Indexing.cpp @@ -371,7 +371,7 @@ public: DataConsumer->setASTContext(CI.getASTContext()); Preprocessor &PP = CI.getPreprocessor(); PP.addPPCallbacks(llvm::make_unique(PP, *DataConsumer)); - DataConsumer->setPreprocessor(PP); + DataConsumer->setPreprocessor(CI.getPreprocessorPtr()); if (SKData) { auto *PPRec = new PPConditionalDirectiveRecord(PP.getSourceManager()); @@ -476,17 +476,19 @@ static CXErrorCode clang_indexSourceFile_Impl( // present it will be unused. if (source_filename) Args->push_back(source_filename); - - IntrusiveRefCntPtr - CInvok(createInvocationFromCommandLine(*Args, Diags)); + + std::shared_ptr CInvok = + createInvocationFromCommandLine(*Args, Diags); if (!CInvok) return CXError_Failure; // Recover resources if we crash before exiting this function. 
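  // (CInvok is now a std::shared_ptr<CompilerInvocation>, so the registrar
  // below must name a destructor cleanup for the shared_ptr object itself,
  // taking &CInvok, instead of releasing an intrusive refcount obtained via
  // CInvok.get().)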
- llvm::CrashRecoveryContextCleanupRegistrar > - CInvokCleanup(CInvok.get()); + llvm::CrashRecoveryContextCleanupRegistrar< + std::shared_ptr, + llvm::CrashRecoveryContextDestructorCleanup< + std::shared_ptr>> + CInvokCleanup(&CInvok); if (CInvok->getFrontendOpts().Inputs.empty()) return CXError_Failure; @@ -518,13 +520,14 @@ static CXErrorCode clang_indexSourceFile_Impl( CInvok->getHeaderSearchOpts().ModuleFormat = CXXIdx->getPCHContainerOperations()->getRawReader().getFormat(); - ASTUnit *Unit = ASTUnit::create(CInvok.get(), Diags, CaptureDiagnostics, - /*UserFilesAreVolatile=*/true); + auto Unit = ASTUnit::create(CInvok, Diags, CaptureDiagnostics, + /*UserFilesAreVolatile=*/true); if (!Unit) return CXError_InvalidArguments; + auto *UPtr = Unit.get(); std::unique_ptr CXTU( - new CXTUOwner(MakeCXTranslationUnit(CXXIdx, Unit))); + new CXTUOwner(MakeCXTranslationUnit(CXXIdx, std::move(Unit)))); // Recover resources if we crash before exiting this method. llvm::CrashRecoveryContextCleanupRegistrar @@ -583,16 +586,16 @@ static CXErrorCode clang_indexSourceFile_Impl( !PrecompilePreamble ? 0 : 2 - CreatePreambleOnFirstParse; DiagnosticErrorTrap DiagTrap(*Diags); bool Success = ASTUnit::LoadFromCompilerInvocationAction( - CInvok.get(), CXXIdx->getPCHContainerOperations(), Diags, - IndexAction.get(), Unit, Persistent, CXXIdx->getClangResourcesPath(), + std::move(CInvok), CXXIdx->getPCHContainerOperations(), Diags, + IndexAction.get(), UPtr, Persistent, CXXIdx->getClangResourcesPath(), OnlyLocalDecls, CaptureDiagnostics, PrecompilePreambleAfterNParses, CacheCodeCompletionResults, /*IncludeBriefCommentsInCodeCompletion=*/false, /*UserFilesAreVolatile=*/true); if (DiagTrap.hasErrorOccurred() && CXXIdx->getDisplayDiagnostics()) - printDiagsToStderr(Unit); + printDiagsToStderr(UPtr); - if (isASTReadError(Unit)) + if (isASTReadError(UPtr)) return CXError_ASTReadError; if (!Success) diff --git a/unittests/AST/ExternalASTSourceTest.cpp b/unittests/AST/ExternalASTSourceTest.cpp index 4b3bb3e2b69b..513ff5b99fad 100644 --- a/unittests/AST/ExternalASTSourceTest.cpp +++ b/unittests/AST/ExternalASTSourceTest.cpp @@ -49,14 +49,14 @@ bool testExternalASTSource(ExternalASTSource *Source, CompilerInstance Compiler; Compiler.createDiagnostics(); - CompilerInvocation *Invocation = new CompilerInvocation; + auto Invocation = std::make_shared(); Invocation->getPreprocessorOpts().addRemappedFile( "test.cc", MemoryBuffer::getMemBuffer(FileContents).release()); const char *Args[] = { "test.cc" }; CompilerInvocation::CreateFromArgs(*Invocation, Args, Args + array_lengthof(Args), Compiler.getDiagnostics()); - Compiler.setInvocation(Invocation); + Compiler.setInvocation(std::move(Invocation)); TestFrontendAction Action(Source); return Compiler.ExecuteAction(Action); diff --git a/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp index 67a4a3b2fc09..5957c7fa41da 100644 --- a/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp +++ b/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp @@ -222,9 +222,12 @@ TEST(HasDeclaration, HasDeclarationOfEnumType) { } TEST(HasDeclaration, HasGetDeclTraitTest) { - EXPECT_TRUE(internal::has_getDecl::value); - EXPECT_TRUE(internal::has_getDecl::value); - EXPECT_FALSE(internal::has_getDecl::value); + static_assert(internal::has_getDecl::value, + "Expected TypedefType to have a getDecl."); + static_assert(internal::has_getDecl::value, + "Expected RecordType to have a getDecl."); + static_assert(!internal::has_getDecl::value, + "Expected 
TemplateSpecializationType to *not* have a getDecl."); } TEST(HasDeclaration, HasDeclarationOfTypeWithDecl) { diff --git a/unittests/Basic/SourceManagerTest.cpp b/unittests/Basic/SourceManagerTest.cpp index f41876147cdd..a967b0ec7c21 100644 --- a/unittests/Basic/SourceManagerTest.cpp +++ b/unittests/Basic/SourceManagerTest.cpp @@ -78,10 +78,10 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnit) { SourceMgr.setMainFileID(mainFileID); VoidModuleLoader ModLoader; - HeaderSearch HeaderInfo(new HeaderSearchOptions, SourceMgr, Diags, LangOpts, - &*Target); - Preprocessor PP(new PreprocessorOptions(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, LangOpts, &*Target); + Preprocessor PP(std::make_shared(), Diags, LangOpts, + SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); @@ -198,10 +198,10 @@ TEST_F(SourceManagerTest, getMacroArgExpandedLocation) { SourceMgr.overrideFileContents(headerFile, std::move(HeaderBuf)); VoidModuleLoader ModLoader; - HeaderSearch HeaderInfo(new HeaderSearchOptions, SourceMgr, Diags, LangOpts, - &*Target); - Preprocessor PP(new PreprocessorOptions(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, LangOpts, &*Target); + Preprocessor PP(std::make_shared(), Diags, LangOpts, + SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); @@ -298,10 +298,10 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnitWithMacroInInclude) { SourceMgr.overrideFileContents(headerFile, std::move(HeaderBuf)); VoidModuleLoader ModLoader; - HeaderSearch HeaderInfo(new HeaderSearchOptions, SourceMgr, Diags, LangOpts, - &*Target); - Preprocessor PP(new PreprocessorOptions(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, LangOpts, &*Target); + Preprocessor PP(std::make_shared(), Diags, LangOpts, + SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); diff --git a/unittests/Format/FormatTestJS.cpp b/unittests/Format/FormatTestJS.cpp index 90c99317bd79..59f4a4f6dcfe 100644 --- a/unittests/Format/FormatTestJS.cpp +++ b/unittests/Format/FormatTestJS.cpp @@ -541,8 +541,8 @@ TEST_F(FormatTestJS, FunctionLiterals) { " foo();\n" " bar();\n" " },\n" - " this, arg1IsReallyLongAndNeeedsLineBreaks,\n" - " arg3IsReallyLongAndNeeedsLineBreaks);"); + " this, arg1IsReallyLongAndNeedsLineBreaks,\n" + " arg3IsReallyLongAndNeedsLineBreaks);"); verifyFormat("var closure = goog.bind(function() { // comment\n" " foo();\n" " bar();\n" diff --git a/unittests/Frontend/CodeGenActionTest.cpp b/unittests/Frontend/CodeGenActionTest.cpp index 356b5130fcbe..1d2a50c8bc20 100644 --- a/unittests/Frontend/CodeGenActionTest.cpp +++ b/unittests/Frontend/CodeGenActionTest.cpp @@ -41,7 +41,7 @@ public: TEST(CodeGenTest, TestNullCodeGen) { - CompilerInvocation *Invocation = new CompilerInvocation; + auto Invocation = std::make_shared(); Invocation->getPreprocessorOpts().addRemappedFile( "test.cc", MemoryBuffer::getMemBuffer("").release()); @@ -50,7 +50,7 @@ TEST(CodeGenTest, TestNullCodeGen) { Invocation->getFrontendOpts().ProgramAction = EmitLLVM; Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler; - Compiler.setInvocation(Invocation); + Compiler.setInvocation(std::move(Invocation)); Compiler.createDiagnostics(); 
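The unit-test hunks above and below all track one ownership change in this import: CompilerInstance::setInvocation now takes std::shared_ptr<CompilerInvocation>, so each test replaces a bare new-plus-raw-pointer handoff with std::make_shared and a move. A minimal standalone sketch of that pattern, using simplified stand-in types rather than the real clang classes:

#include <memory>
#include <utility>

struct Invocation { bool CPlusPlus = false; }; // stand-in for CompilerInvocation

class Instance { // stand-in for CompilerInstance
  std::shared_ptr<Invocation> Inv;

public:
  // Taking the shared_ptr by value and moving it in makes the transfer of
  // ownership explicit at the call site, while still letting the instance
  // share the invocation with other components that outlive the caller.
  void setInvocation(std::shared_ptr<Invocation> I) { Inv = std::move(I); }
};

int main() {
  auto Inv = std::make_shared<Invocation>(); // replaces 'new CompilerInvocation'
  Inv->CPlusPlus = true;
  Instance C;
  C.setInvocation(std::move(Inv)); // Inv is empty from here on
  return 0;
}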
EXPECT_TRUE(Compiler.hasDiagnostics()); diff --git a/unittests/Frontend/FrontendActionTest.cpp b/unittests/Frontend/FrontendActionTest.cpp index c3e6adb6324d..dd6be5fd4b98 100644 --- a/unittests/Frontend/FrontendActionTest.cpp +++ b/unittests/Frontend/FrontendActionTest.cpp @@ -79,7 +79,7 @@ private: }; TEST(ASTFrontendAction, Sanity) { - CompilerInvocation *invocation = new CompilerInvocation; + auto invocation = std::make_shared(); invocation->getPreprocessorOpts().addRemappedFile( "test.cc", MemoryBuffer::getMemBuffer("int main() { float x; }").release()); @@ -88,7 +88,7 @@ TEST(ASTFrontendAction, Sanity) { invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance compiler; - compiler.setInvocation(invocation); + compiler.setInvocation(std::move(invocation)); compiler.createDiagnostics(); TestASTFrontendAction test_action; @@ -99,7 +99,7 @@ TEST(ASTFrontendAction, Sanity) { } TEST(ASTFrontendAction, IncrementalParsing) { - CompilerInvocation *invocation = new CompilerInvocation; + auto invocation = std::make_shared(); invocation->getPreprocessorOpts().addRemappedFile( "test.cc", MemoryBuffer::getMemBuffer("int main() { float x; }").release()); @@ -108,7 +108,7 @@ TEST(ASTFrontendAction, IncrementalParsing) { invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance compiler; - compiler.setInvocation(invocation); + compiler.setInvocation(std::move(invocation)); compiler.createDiagnostics(); TestASTFrontendAction test_action(/*enableIncrementalProcessing=*/true); @@ -119,7 +119,7 @@ TEST(ASTFrontendAction, IncrementalParsing) { } TEST(ASTFrontendAction, LateTemplateIncrementalParsing) { - CompilerInvocation *invocation = new CompilerInvocation; + auto invocation = std::make_shared(); invocation->getLangOpts()->CPlusPlus = true; invocation->getLangOpts()->DelayedTemplateParsing = true; invocation->getPreprocessorOpts().addRemappedFile( @@ -135,7 +135,7 @@ TEST(ASTFrontendAction, LateTemplateIncrementalParsing) { invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance compiler; - compiler.setInvocation(invocation); + compiler.setInvocation(std::move(invocation)); compiler.createDiagnostics(); TestASTFrontendAction test_action(/*enableIncrementalProcessing=*/true, @@ -172,7 +172,7 @@ public: }; TEST(PreprocessorFrontendAction, EndSourceFile) { - CompilerInvocation *Invocation = new CompilerInvocation; + auto Invocation = std::make_shared(); Invocation->getPreprocessorOpts().addRemappedFile( "test.cc", MemoryBuffer::getMemBuffer("int main() { float x; }").release()); @@ -181,7 +181,7 @@ TEST(PreprocessorFrontendAction, EndSourceFile) { Invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler; - Compiler.setInvocation(Invocation); + Compiler.setInvocation(std::move(Invocation)); Compiler.createDiagnostics(); TestPPCallbacks *Callbacks = new TestPPCallbacks; @@ -231,7 +231,7 @@ struct TypoDiagnosticConsumer : public DiagnosticConsumer { }; TEST(ASTFrontendAction, ExternalSemaSource) { - auto *Invocation = new CompilerInvocation; + auto Invocation = std::make_shared(); Invocation->getLangOpts()->CPlusPlus = true; Invocation->getPreprocessorOpts().addRemappedFile( "test.cc", MemoryBuffer::getMemBuffer("void 
fooo();\n" @@ -242,7 +242,7 @@ TEST(ASTFrontendAction, ExternalSemaSource) { Invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly; Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu"; CompilerInstance Compiler; - Compiler.setInvocation(Invocation); + Compiler.setInvocation(std::move(Invocation)); auto *TDC = new TypoDiagnosticConsumer; Compiler.createDiagnostics(TDC, /*ShouldOwnClient=*/true); Compiler.setExternalSemaSource(new TypoExternalSemaSource(Compiler)); diff --git a/unittests/Lex/LexerTest.cpp b/unittests/Lex/LexerTest.cpp index 204601818152..918167bf43c5 100644 --- a/unittests/Lex/LexerTest.cpp +++ b/unittests/Lex/LexerTest.cpp @@ -64,10 +64,10 @@ protected: SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); VoidModuleLoader ModLoader; - HeaderSearch HeaderInfo(new HeaderSearchOptions, SourceMgr, Diags, LangOpts, - Target.get()); - Preprocessor PP(new PreprocessorOptions(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, /*IILookup =*/nullptr, + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, LangOpts, Target.get()); + Preprocessor PP(std::make_shared(), Diags, LangOpts, + SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); PP.EnterMainSourceFile(); diff --git a/unittests/Lex/PPCallbacksTest.cpp b/unittests/Lex/PPCallbacksTest.cpp index cbce5c6e1676..064abafc4a88 100644 --- a/unittests/Lex/PPCallbacksTest.cpp +++ b/unittests/Lex/PPCallbacksTest.cpp @@ -162,13 +162,12 @@ protected: VoidModuleLoader ModLoader; - IntrusiveRefCntPtr HSOpts = new HeaderSearchOptions(); - HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, - Target.get()); + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, LangOpts, Target.get()); AddFakeHeader(HeaderInfo, HeaderPath, SystemHeader); - IntrusiveRefCntPtr PPOpts = new PreprocessorOptions(); - Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + Preprocessor PP(std::make_shared(), Diags, LangOpts, + SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); @@ -199,11 +198,12 @@ protected: SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(SourceBuf))); VoidModuleLoader ModLoader; - HeaderSearch HeaderInfo(new HeaderSearchOptions, SourceMgr, Diags, - OpenCLLangOpts, Target.get()); + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, OpenCLLangOpts, Target.get()); - Preprocessor PP(new PreprocessorOptions(), Diags, OpenCLLangOpts, SourceMgr, - HeaderInfo, ModLoader, /*IILookup =*/nullptr, + Preprocessor PP(std::make_shared(), Diags, + OpenCLLangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); diff --git a/unittests/Lex/PPConditionalDirectiveRecordTest.cpp b/unittests/Lex/PPConditionalDirectiveRecordTest.cpp index bceeac57ea61..dccfffdb2c15 100644 --- a/unittests/Lex/PPConditionalDirectiveRecordTest.cpp +++ b/unittests/Lex/PPConditionalDirectiveRecordTest.cpp @@ -93,10 +93,10 @@ TEST_F(PPConditionalDirectiveRecordTest, PPRecAPI) { SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); VoidModuleLoader ModLoader; - HeaderSearch HeaderInfo(new HeaderSearchOptions, SourceMgr, Diags, LangOpts, - Target.get()); - Preprocessor PP(new PreprocessorOptions(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, + HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, + Diags, LangOpts, Target.get()); + Preprocessor PP(std::make_shared(), Diags, 
LangOpts, + SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); diff --git a/utils/TableGen/ClangAttrEmitter.cpp b/utils/TableGen/ClangAttrEmitter.cpp index d65794e86374..27ab34c1309d 100644 --- a/utils/TableGen/ClangAttrEmitter.cpp +++ b/utils/TableGen/ClangAttrEmitter.cpp @@ -133,10 +133,9 @@ static StringRef NormalizeNameForSpellingComparison(StringRef Name) { return Name.trim("_"); } -// Normalize attribute spelling only if the spelling has both leading -// and trailing underscores. For example, __ms_struct__ will be -// normalized to "ms_struct"; __cdecl will remain intact. -static StringRef NormalizeAttrSpelling(StringRef AttrSpelling) { +// Normalize the spelling of a GNU attribute (i.e. "x" in "__attribute__((x))"), +// removing "__" if it appears at the beginning and end of the attribute's name. +static StringRef NormalizeGNUAttrSpelling(StringRef AttrSpelling) { if (AttrSpelling.startswith("__") && AttrSpelling.endswith("__")) { AttrSpelling = AttrSpelling.substr(2, AttrSpelling.size() - 4); } @@ -3045,7 +3044,11 @@ void EmitClangAttrParsedAttrKinds(RecordKeeper &Records, raw_ostream &OS) { assert(Matches && "Unsupported spelling variety found"); - Spelling += NormalizeAttrSpelling(RawSpelling); + if (Variety == "GNU") + Spelling += NormalizeGNUAttrSpelling(RawSpelling); + else + Spelling += RawSpelling; + if (SemaHandler) Matches->push_back(StringMatcher::StringPair(Spelling, "return AttributeList::AT_" + AttrName + ";")); -- cgit v1.2.3 From 909545a822eef491158f831688066f0ec2866938 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 9 Jan 2017 21:23:09 +0000 Subject: Vendor import of llvm trunk r291476: https://llvm.org/svn/llvm-project/llvm/trunk@291476 --- cmake/config-ix.cmake | 13 +- cmake/modules/AddLLVM.cmake | 17 +- include/llvm/Analysis/ScalarEvolution.h | 2 + include/llvm/Analysis/TargetLibraryInfo.h | 4 +- include/llvm/CodeGen/MachineBasicBlock.h | 10 + include/llvm/CodeGen/MachineFrameInfo.h | 3 +- include/llvm/DebugInfo/MSF/StreamArray.h | 111 +- .../ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h | 4 +- include/llvm/ExecutionEngine/Orc/RPCUtils.h | 246 +- include/llvm/ExecutionEngine/Orc/RawByteChannel.h | 4 +- include/llvm/IR/ModuleSummaryIndexYAML.h | 12 +- include/llvm/IR/PassManager.h | 127 +- include/llvm/IR/User.h | 20 + include/llvm/Support/Path.h | 8 + include/llvm/Transforms/IPO.h | 13 +- include/llvm/Transforms/IPO/PassManagerBuilder.h | 1 - lib/Analysis/InstructionSimplify.cpp | 20 + lib/Analysis/LoopInfo.cpp | 6 +- lib/Analysis/MemoryDependenceAnalysis.cpp | 42 +- lib/Analysis/ScalarEvolution.cpp | 12 + lib/Analysis/ValueTracking.cpp | 1 + lib/Bitcode/Reader/MetadataLoader.cpp | 13 +- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 4 +- lib/CodeGen/StackSlotColoring.cpp | 11 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 44 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 3 + lib/LTO/ThinLTOCodeGenerator.cpp | 9 +- lib/Object/MachOObjectFile.cpp | 8 + lib/Object/ModuleSummaryIndexObjectFile.cpp | 8 + lib/Support/CommandLine.cpp | 2 +- lib/Support/Path.cpp | 10 + lib/Support/TarWriter.cpp | 42 +- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 + lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 - lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 52 +- lib/Target/AMDGPU/R600ISelLowering.cpp | 281 +- lib/Target/AMDGPU/R600Instructions.td | 11 + lib/Target/AMDGPU/SIISelLowering.cpp | 39 +- lib/Target/AMDGPU/SIISelLowering.h | 3 +- lib/Target/AVR/AVRISelDAGToDAG.cpp | 4 +- 
lib/Target/AVR/AVRISelLowering.cpp | 41 + lib/Target/AVR/AVRISelLowering.h | 3 + lib/Target/BPF/BPFInstrInfo.cpp | 16 +- lib/Target/BPF/Disassembler/BPFDisassembler.cpp | 12 +- lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 20 +- lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp | 11 +- lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp | 19 +- lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp | 9 +- lib/Target/TargetMachineC.cpp | 4 +- lib/Target/WebAssembly/CMakeLists.txt | 1 + lib/Target/WebAssembly/WebAssembly.h | 1 + .../WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 159 + lib/Target/WebAssembly/WebAssemblyInstrInteger.td | 4 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 4 + lib/Target/X86/X86ISelLowering.cpp | 264 +- lib/Target/X86/X86InstrAVX512.td | 247 +- lib/Target/X86/X86InstrInfo.cpp | 19 +- lib/Target/X86/X86InstrSSE.td | 2 +- lib/Target/X86/X86TargetTransformInfo.cpp | 291 +- lib/Transforms/IPO/LowerTypeTests.cpp | 109 +- lib/Transforms/IPO/PassManagerBuilder.cpp | 3 +- lib/Transforms/InstCombine/InstCombineCompares.cpp | 10 +- .../Instrumentation/AddressSanitizer.cpp | 1 + lib/Transforms/Scalar/IndVarSimplify.cpp | 2 +- lib/Transforms/Scalar/LoopLoadElimination.cpp | 4 +- lib/Transforms/Scalar/LoopUnswitch.cpp | 2 +- lib/Transforms/Scalar/NewGVN.cpp | 192 +- lib/Transforms/Scalar/SCCP.cpp | 18 - lib/Transforms/Utils/FunctionImportUtils.cpp | 15 +- lib/Transforms/Utils/SimplifyLibCalls.cpp | 12 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 34 +- test/Analysis/CostModel/X86/shuffle-reverse.ll | 2 +- test/Analysis/CostModel/X86/testshiftlshr.ll | 4 +- test/Analysis/CostModel/X86/testshiftshl.ll | 4 +- test/Analysis/CostModel/X86/vshift-ashr-cost.ll | 45 +- test/Analysis/CostModel/X86/vshift-lshr-cost.ll | 66 +- test/Analysis/CostModel/X86/vshift-shl-cost.ll | 70 +- test/Analysis/ScalarEvolution/invalidation.ll | 70 + test/Analysis/ValueTracking/assume.ll | 22 +- test/Bindings/Go/lit.local.cfg | 2 +- test/Bindings/OCaml/lit.local.cfg | 2 +- test/CMakeLists.txt | 14 +- test/CodeGen/AMDGPU/load-constant-i16.ll | 138 +- test/CodeGen/AMDGPU/load-global-i16.ll | 331 +- test/CodeGen/AMDGPU/min.ll | 172 +- test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll | 16 + test/CodeGen/AMDGPU/store-private.ll | 743 +++ test/CodeGen/AVR/intrinsics/read_register.ll | 17 + test/CodeGen/WebAssembly/function-bitcasts.ll | 56 + .../WebAssembly/unsupported-function-bitcasts.ll | 26 + test/CodeGen/X86/avx2-arith.ll | 101 +- test/CodeGen/X86/avx512-bugfix-23634.ll | 2 +- test/CodeGen/X86/avx512-calling-conv.ll | 24 +- test/CodeGen/X86/avx512-cvt.ll | 14 +- test/CodeGen/X86/avx512-ext.ll | 33 +- test/CodeGen/X86/avx512-insert-extract.ll | 56 +- test/CodeGen/X86/avx512-mask-op.ll | 110 +- test/CodeGen/X86/avx512-mov.ll | 16 +- test/CodeGen/X86/avx512-regcall-NoMask.ll | 30 +- test/CodeGen/X86/avx512-vbroadcast.ll | 3 +- test/CodeGen/X86/avx512-vec-cmp.ll | 141 +- test/CodeGen/X86/avx512bw-mov.ll | 4 +- test/CodeGen/X86/avx512bw-vec-cmp.ll | 36 +- test/CodeGen/X86/avx512bwvl-mov.ll | 8 +- test/CodeGen/X86/avx512bwvl-vec-cmp.ll | 72 +- test/CodeGen/X86/avx512vl-mov.ll | 32 +- test/CodeGen/X86/avx512vl-vec-cmp.ll | 144 +- test/CodeGen/X86/cmov.ll | 6 +- test/CodeGen/X86/fma-fneg-combine.ll | 12 +- test/CodeGen/X86/fmaddsub-combine.ll | 129 + test/CodeGen/X86/sse-fsignum.ll | 11 +- test/CodeGen/X86/vector-compare-results.ll | 6208 +++++++++++++++----- test/CodeGen/X86/vector-sext.ll | 45 +- test/CodeGen/X86/vector-shift-ashr-128.ll | 130 +- test/CodeGen/X86/vector-shift-ashr-256.ll | 234 +- 
test/CodeGen/X86/vector-shift-ashr-512.ll | 52 +- test/CodeGen/X86/vector-shift-lshr-128.ll | 94 +- test/CodeGen/X86/vector-shift-lshr-256.ll | 162 +- test/CodeGen/X86/vector-shift-lshr-512.ll | 52 +- test/CodeGen/X86/vector-shift-shl-128.ll | 88 +- test/CodeGen/X86/vector-shift-shl-256.ll | 154 +- test/CodeGen/X86/vector-shift-shl-512.ll | 27 +- test/CodeGen/X86/vector-shuffle-512-v64.ll | 9 +- test/CodeGen/X86/vector-shuffle-masked.ll | 33 +- test/CodeGen/X86/vector-shuffle-v1.ll | 74 +- test/ExecutionEngine/Interpreter/lit.local.cfg | 2 +- .../RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s | 11 +- .../RuntimeDyld/AArch64/ELF_ARM64_local_branch.s | 14 + .../RuntimeDyld/AArch64/ELF_ARM64_relocations.s | 35 +- .../AddressSanitizer/global_metadata_darwin.ll | 2 +- test/JitListener/lit.local.cfg | 2 +- test/ThinLTO/X86/Inputs/funcimport-tbaa.ll | 11 + test/ThinLTO/X86/Inputs/local_name_conflict1.ll | 17 + test/ThinLTO/X86/Inputs/local_name_conflict2.ll | 17 + test/ThinLTO/X86/funcimport-tbaa.ll | 38 + test/ThinLTO/X86/local_name_conflict.ll | 29 + test/Transforms/GVN/invariant.group.ll | 52 + test/Transforms/InstCombine/assume.ll | 45 +- test/Transforms/InstCombine/assume2.ll | 141 +- test/Transforms/InstCombine/fabs.ll | 42 +- test/Transforms/InstCombine/fast-math.ll | 6 +- test/Transforms/InstCombine/urem-simplify-bug.ll | 52 +- test/Transforms/InstSimplify/div.ll | 15 + test/Transforms/InstSimplify/rem.ll | 14 + test/Transforms/LICM/hoisting.ll | 27 + test/Transforms/LoopLoadElim/forward.ll | 6 +- test/Transforms/LoopVectorize/iv_outside_user.ll | 45 + test/Transforms/NewGVN/basic-cyclic-opt.ll | 235 + test/Transforms/NewGVN/cyclic-phi-handling.ll | 37 + test/Transforms/NewGVN/invariant.group.ll | 52 + test/Transforms/NewGVN/memory-handling.ll | 195 + test/Transforms/NewGVN/pr31501.ll | 136 + test/Transforms/NewGVN/pr31573.ll | 42 + test/lit.cfg | 10 +- test/lit.site.cfg.in | 18 +- test/tools/llvm-config/system-libs.test | 3 +- test/tools/llvm-config/system-libs.windows.test | 3 +- test/tools/llvm-opt-report/Inputs/dm.c | 13 + test/tools/llvm-opt-report/Inputs/dm.yaml | 104 + test/tools/llvm-opt-report/func-dm.test | 17 + tools/llvm-config/llvm-config.cpp | 8 +- tools/llvm-objdump/MachODump.cpp | 31 +- tools/llvm-opt-report/OptReport.cpp | 14 +- unittests/ExecutionEngine/Orc/RPCUtilsTest.cpp | 59 +- unittests/IR/UserTest.cpp | 25 + utils/unittest/CMakeLists.txt | 4 +- utils/update_test_checks.py | 51 +- 167 files changed, 10583 insertions(+), 4006 deletions(-) create mode 100644 lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp create mode 100644 test/Analysis/ScalarEvolution/invalidation.ll create mode 100644 test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll create mode 100644 test/CodeGen/AMDGPU/store-private.ll create mode 100644 test/CodeGen/AVR/intrinsics/read_register.ll create mode 100644 test/CodeGen/WebAssembly/function-bitcasts.ll create mode 100644 test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll create mode 100644 test/CodeGen/X86/fmaddsub-combine.ll create mode 100644 test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_local_branch.s create mode 100644 test/ThinLTO/X86/Inputs/funcimport-tbaa.ll create mode 100644 test/ThinLTO/X86/Inputs/local_name_conflict1.ll create mode 100644 test/ThinLTO/X86/Inputs/local_name_conflict2.ll create mode 100644 test/ThinLTO/X86/funcimport-tbaa.ll create mode 100644 test/ThinLTO/X86/local_name_conflict.ll create mode 100644 test/Transforms/InstSimplify/div.ll create mode 100644 test/Transforms/NewGVN/basic-cyclic-opt.ll create mode 
100644 test/Transforms/NewGVN/cyclic-phi-handling.ll create mode 100644 test/Transforms/NewGVN/memory-handling.ll create mode 100644 test/Transforms/NewGVN/pr31501.ll create mode 100644 test/Transforms/NewGVN/pr31573.ll create mode 100644 test/tools/llvm-opt-report/Inputs/dm.c create mode 100644 test/tools/llvm-opt-report/Inputs/dm.yaml create mode 100644 test/tools/llvm-opt-report/func-dm.test diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index d76f1293d02c..4288cf4bdd04 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -316,9 +316,9 @@ else() endif() endif() -check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG) -check_cxx_compiler_flag("-Wno-gnu-zero-variadic-macro-arguments" - SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) +check_cxx_compiler_flag("-Wvariadic-macros" SUPPORTS_VARIADIC_MACROS_FLAG) +check_cxx_compiler_flag("-Wgnu-zero-variadic-macro-arguments" + SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) set(USE_NO_MAYBE_UNINITIALIZED 0) set(USE_NO_UNINITIALIZED 0) @@ -462,13 +462,6 @@ if( MSVC ) if(LLVM_ENABLE_DIA_SDK AND NOT HAVE_DIA_SDK) message(FATAL_ERROR "DIA SDK not found. If you have both VS 2012 and 2013 installed, you may need to uninstall the former and re-install the latter afterwards.") endif() - - # Normalize to 0/1 for lit.site.cfg - if(LLVM_ENABLE_DIA_SDK) - set(LLVM_ENABLE_DIA_SDK 1) - else() - set(LLVM_ENABLE_DIA_SDK 0) - endif() else() set(LLVM_ENABLE_DIA_SDK 0) endif( MSVC ) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index fbef1d04eac4..56ba1479d7ee 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -1011,11 +1011,11 @@ function(add_unittest test_suite test_name) list(APPEND LLVM_COMPILE_DEFINITIONS GTEST_HAS_PTHREAD=0) endif () - if (SUPPORTS_NO_VARIADIC_MACROS_FLAG) + if (SUPPORTS_VARIADIC_MACROS_FLAG) list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros") endif () # Some parts of gtest rely on this GNU extension, don't warn on it. - if(SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) + if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) list(APPEND LLVM_COMPILE_FLAGS "-Wno-gnu-zero-variadic-macro-arguments") endif() @@ -1067,6 +1067,19 @@ function(llvm_add_go_executable binary pkgpath) endif() endfunction() +# This function canonicalizes the CMake variables passed by name +# from CMake booleans to 0/1, suitable for passing into Python or C++, +# in place. +function(llvm_canonicalize_cmake_booleans) + foreach(var ${ARGN}) + if(${var}) + set(${var} 1 PARENT_SCOPE) + else() + set(${var} 0 PARENT_SCOPE) + endif() + endforeach() +endfunction(llvm_canonicalize_cmake_booleans) + # This function provides an automatic way to 'configure'-like generate a file # based on a set of common and custom variables, specifically targeting the # variables needed for the 'lit.site.cfg' files. This function bundles the diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 9dcffe1ac5fb..1a93f9aa5fd2 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -1491,6 +1491,8 @@ public: void print(raw_ostream &OS) const; void verify() const; + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv); /// Collect parametric terms occurring in step expressions (first step of /// delinearization).
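The ScalarEvolution hunk just above declares the new-pass-manager invalidate hook: after a pass runs, each cached analysis result is asked whether it survives, given the set of analyses the pass preserved. A minimal standalone analog of that contract, with plain stand-in types in place of LLVM's PreservedAnalyses and Invalidator:

#include <iostream>
#include <set>
#include <string>

// Stand-in for llvm::PreservedAnalyses: the set of analyses that the pass
// which just ran reported as still valid.
struct PreservedSet {
  std::set<std::string> Names;
  bool preserved(const std::string &Name) const { return Names.count(Name) != 0; }
};

// Stand-in for a cached analysis result. Returning true from invalidate()
// means "drop me from the cache and recompute on the next query".
struct CachedResult {
  bool invalidate(const PreservedSet &PA) const {
    return !PA.preserved("ScalarEvolution");
  }
};

int main() {
  CachedResult R;
  PreservedSet PA; // the pass preserved nothing
  std::cout << R.invalidate(PA) << '\n'; // 1: must be recomputed
  PA.Names.insert("ScalarEvolution");
  std::cout << R.invalidate(PA) << '\n'; // 0: cached result stays valid
  return 0;
}

The real hook also receives an Invalidator argument so a result can declare itself stale when an analysis it depends on (for ScalarEvolution, e.g. the dominator tree or loop info) is itself invalidated.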
diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h index 196fbc7faa8d..8675882431d5 100644 --- a/include/llvm/Analysis/TargetLibraryInfo.h +++ b/include/llvm/Analysis/TargetLibraryInfo.h @@ -290,7 +290,7 @@ public: } /// Returns extension attribute kind to be used for i32 parameters - /// correpsonding to C-level int or unsigned int. May be zeroext, signext, + /// corresponding to C-level int or unsigned int. May be zeroext, signext, /// or none. Attribute::AttrKind getExtAttrForI32Param(bool Signed = true) const { if (Impl->ShouldExtI32Param) @@ -301,7 +301,7 @@ public: } /// Returns extension attribute kind to be used for i32 return values - /// correpsonding to C-level int or unsigned int. May be zeroext, signext, + /// corresponding to C-level int or unsigned int. May be zeroext, signext, /// or none. Attribute::AttrKind getExtAttrForI32Return(bool Signed = true) const { if (Impl->ShouldExtI32Return) diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 92a9896d7a18..f3f5e324d76a 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -308,6 +308,16 @@ public: // Iteration support for live in sets. These sets are kept in sorted // order by their register number. typedef LiveInVector::const_iterator livein_iterator; +#ifndef NDEBUG + /// Unlike livein_begin, this method does not check that the liveness + /// information is accurate. Still for debug purposes it may be useful + /// to have iterators that won't assert if the liveness information + /// is not current. + livein_iterator livein_begin_dbg() const { return LiveIns.begin(); } + iterator_range liveins_dbg() const { + return make_range(livein_begin_dbg(), livein_end()); + } +#endif livein_iterator livein_begin() const; livein_iterator livein_end() const { return LiveIns.end(); } bool livein_empty() const { return LiveIns.empty(); } diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index 2fab8137564e..4600c2c0f10c 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -148,8 +148,7 @@ class MachineFrameInfo { /// grouping overaligned allocas into a "secondary stack frame" and /// then only use a single alloca to allocate this frame and only a /// single virtual register to access it. Currently, without such an - /// optimization, each such alloca gets it's own dynamic - /// realignment. + /// optimization, each such alloca gets its own dynamic realignment. bool StackRealignable; /// Whether the function has the \c alignstack attribute. 
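The MachineBasicBlock hunk above adds livein_begin_dbg and wraps the begin/end pair with make_range, so debug code can iterate live-ins with a range-based for loop even when liveness information is not guaranteed current. A self-contained sketch of that iterator_range idiom (Block and the int register numbers are stand-ins; the range type mimics llvm::iterator_range):

#include <iostream>
#include <vector>

// Minimal analog of llvm::iterator_range: packages a begin/end pair so a
// member function can expose a whole range as one object.
template <typename It> struct iterator_range {
  It B, E;
  It begin() const { return B; }
  It end() const { return E; }
};

template <typename It> iterator_range<It> make_range(It B, It E) {
  return {B, E};
}

struct Block {
  std::vector<int> LiveIns; // stand-in for the live-in register list
  using livein_iterator = std::vector<int>::const_iterator;

  // The "_dbg" accessor in the patch deliberately skips the up-to-date
  // liveness assertion that the regular accessor performs.
  livein_iterator livein_begin_dbg() const { return LiveIns.begin(); }
  livein_iterator livein_end() const { return LiveIns.end(); }
  iterator_range<livein_iterator> liveins_dbg() const {
    return make_range(livein_begin_dbg(), livein_end());
  }
};

int main() {
  Block B{{1, 2, 3}};
  for (int Reg : B.liveins_dbg()) // reads cleanly even with stale liveness
    std::cout << Reg << ' ';
  std::cout << '\n';
  return 0;
}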
diff --git a/include/llvm/DebugInfo/MSF/StreamArray.h b/include/llvm/DebugInfo/MSF/StreamArray.h index d8b74bc75c94..3bba80d807f3 100644 --- a/include/llvm/DebugInfo/MSF/StreamArray.h +++ b/include/llvm/DebugInfo/MSF/StreamArray.h @@ -11,6 +11,7 @@ #define LLVM_DEBUGINFO_MSF_STREAMARRAY_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/iterator.h" #include "llvm/DebugInfo/MSF/StreamRef.h" #include "llvm/Support/Error.h" #include @@ -107,7 +108,10 @@ private: Extractor E; }; -template class VarStreamArrayIterator { +template +class VarStreamArrayIterator + : public iterator_facade_base, + std::forward_iterator_tag, ValueType> { typedef VarStreamArrayIterator IterType; typedef VarStreamArray ArrayType; @@ -144,41 +148,39 @@ public: return false; } - bool operator!=(const IterType &R) { return !(*this == R); } - const ValueType &operator*() const { assert(Array && !HasError); return ThisValue; } - IterType &operator++() { - // We are done with the current record, discard it so that we are - // positioned at the next record. - IterRef = IterRef.drop_front(ThisLen); - if (IterRef.getLength() == 0) { - // There is nothing after the current record, we must make this an end - // iterator. - moveToEnd(); - } else { - // There is some data after the current record. - auto EC = Extract(IterRef, ThisLen, ThisValue); - if (EC) { - consumeError(std::move(EC)); - markError(); - } else if (ThisLen == 0) { - // An empty record? Make this an end iterator. + IterType &operator+=(std::ptrdiff_t N) { + while (N > 0) { + // We are done with the current record, discard it so that we are + // positioned at the next record. + IterRef = IterRef.drop_front(ThisLen); + if (IterRef.getLength() == 0) { + // There is nothing after the current record, we must make this an end + // iterator. moveToEnd(); + return *this; + } else { + // There is some data after the current record. + auto EC = Extract(IterRef, ThisLen, ThisValue); + if (EC) { + consumeError(std::move(EC)); + markError(); + return *this; + } else if (ThisLen == 0) { + // An empty record? Make this an end iterator. 
+ moveToEnd(); + return *this; + } } + --N; } return *this; } - IterType operator++(int) { - IterType Original = *this; - ++*this; - return Original; - } - private: void moveToEnd() { Array = nullptr; @@ -211,6 +213,16 @@ public: assert(Stream.getLength() % sizeof(T) == 0); } + bool operator==(const FixedStreamArray &Other) const { + return Stream == Other.Stream; + } + + bool operator!=(const FixedStreamArray &Other) const { + return !(*this == Other); + } + + FixedStreamArray &operator=(const FixedStreamArray &) = default; + const T &operator[](uint32_t Index) const { assert(Index < size()); uint32_t Off = Index * sizeof(T); @@ -226,6 +238,8 @@ public: uint32_t size() const { return Stream.getLength() / sizeof(T); } + bool empty() const { return size() == 0; } + FixedStreamArrayIterator begin() const { return FixedStreamArrayIterator(*this, 0); } @@ -240,36 +254,53 @@ private: ReadableStreamRef Stream; }; -template class FixedStreamArrayIterator { +template +class FixedStreamArrayIterator + : public iterator_facade_base, + std::random_access_iterator_tag, T> { + public: FixedStreamArrayIterator(const FixedStreamArray &Array, uint32_t Index) : Array(Array), Index(Index) {} - bool operator==(const FixedStreamArrayIterator &R) { - assert(&Array == &R.Array); - return Index == R.Index; + FixedStreamArrayIterator & + operator=(const FixedStreamArrayIterator &Other) { + Array = Other.Array; + Index = Other.Index; + return *this; } - bool operator!=(const FixedStreamArrayIterator &R) { - return !(*this == R); + const T &operator*() const { return Array[Index]; } + + bool operator==(const FixedStreamArrayIterator &R) const { + assert(Array == R.Array); + return (Index == R.Index) && (Array == R.Array); } - const T &operator*() const { return Array[Index]; } + FixedStreamArrayIterator &operator+=(std::ptrdiff_t N) { + Index += N; + return *this; + } - FixedStreamArrayIterator &operator++() { - assert(Index < Array.size()); - ++Index; + FixedStreamArrayIterator &operator-=(std::ptrdiff_t N) { + assert(Index >= N); + Index -= N; return *this; } - FixedStreamArrayIterator operator++(int) { - FixedStreamArrayIterator Original = *this; - ++*this; - return Original; + std::ptrdiff_t operator-(const FixedStreamArrayIterator &R) const { + assert(Array == R.Array); + assert(Index >= R.Index); + return Index - R.Index; + } + + bool operator<(const FixedStreamArrayIterator &RHS) const { + assert(Array == RHS.Array); + return Index < RHS.Index; } private: - const FixedStreamArray &Array; + FixedStreamArray Array; uint32_t Index; }; diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h index ab2b0fad89fd..3086ef0cdf80 100644 --- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h +++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h @@ -83,7 +83,7 @@ public: namespace remote { class OrcRemoteTargetRPCAPI - : public rpc::SingleThreadedRPC { + : public rpc::SingleThreadedRPCEndpoint { protected: class ResourceIdMgr { public: @@ -108,7 +108,7 @@ protected: public: // FIXME: Remove constructors once MSVC supports synthesizing move-ops. 
OrcRemoteTargetRPCAPI(rpc::RawByteChannel &C) - : rpc::SingleThreadedRPC(C, true) {} + : rpc::SingleThreadedRPCEndpoint(C, true) {} class CallIntVoid : public rpc::Function { diff --git a/include/llvm/ExecutionEngine/Orc/RPCUtils.h b/include/llvm/ExecutionEngine/Orc/RPCUtils.h index f51fbe153a41..37e2e66e5af4 100644 --- a/include/llvm/ExecutionEngine/Orc/RPCUtils.h +++ b/include/llvm/ExecutionEngine/Orc/RPCUtils.h @@ -702,7 +702,7 @@ public: /// sync. template -class RPCBase { +class RPCEndpointBase { protected: class OrcRPCInvalid : public Function { public: @@ -747,7 +747,7 @@ protected: public: /// Construct an RPC instance on a channel. - RPCBase(ChannelT &C, bool LazyAutoNegotiation) + RPCEndpointBase(ChannelT &C, bool LazyAutoNegotiation) : C(C), LazyAutoNegotiation(LazyAutoNegotiation) { // Hold ResponseId in a special variable, since we expect Response to be // called relatively frequently, and want to avoid the map lookup. @@ -788,15 +788,21 @@ public: return FnIdOrErr.takeError(); } - // Allocate a sequence number. - auto SeqNo = SequenceNumberMgr.getSequenceNumber(); - assert(!PendingResponses.count(SeqNo) && - "Sequence number already allocated"); + SequenceNumberT SeqNo; // initialized in locked scope below. + { + // Lock the pending responses map and sequence number manager. + std::lock_guard Lock(ResponsesMutex); + + // Allocate a sequence number. + SeqNo = SequenceNumberMgr.getSequenceNumber(); + assert(!PendingResponses.count(SeqNo) && + "Sequence number already allocated"); - // Install the user handler. - PendingResponses[SeqNo] = + // Install the user handler. + PendingResponses[SeqNo] = detail::createResponseHandler( std::move(Handler)); + } // Open the function call message. if (auto Err = C.startSendMessage(FnId, SeqNo)) { @@ -863,11 +869,33 @@ public: return detail::ReadArgs(Args...); } + /// Abandon all outstanding result handlers. + /// + /// This will call all currently registered result handlers to receive an + /// "abandoned" error as their argument. This is used internally by the RPC + /// in error situations, but can also be called directly by clients who are + /// disconnecting from the remote and don't or can't expect responses to their + /// outstanding calls. (Especially for outstanding blocking calls, calling + /// this function may be necessary to avoid dead threads). + void abandonPendingResponses() { + // Lock the pending responses map and sequence number manager. + std::lock_guard Lock(ResponsesMutex); + + for (auto &KV : PendingResponses) + KV.second->abandon(); + PendingResponses.clear(); + SequenceNumberMgr.reset(); + } + protected: // The LaunchPolicy type allows a launch policy to be specified when adding // a function handler. See addHandlerImpl. using LaunchPolicy = std::function)>; + FunctionIdT getInvalidFunctionId() const { + return FnIdAllocator.getInvalidId(); + } + /// Add the given handler to the handler map and make it available for /// autonegotiation and execution. template @@ -884,28 +912,32 @@ protected: wrapHandler(std::move(Handler), std::move(Launch)); } - // Abandon all outstanding results. 
- void abandonPendingResponses() { - for (auto &KV : PendingResponses) - KV.second->abandon(); - PendingResponses.clear(); - SequenceNumberMgr.reset(); - } - Error handleResponse(SequenceNumberT SeqNo) { - auto I = PendingResponses.find(SeqNo); - if (I == PendingResponses.end()) { - abandonPendingResponses(); - return orcError(OrcErrorCode::UnexpectedRPCResponse); + using Handler = typename decltype(PendingResponses)::mapped_type; + Handler PRHandler; + + { + // Lock the pending responses map and sequence number manager. + std::unique_lock Lock(ResponsesMutex); + auto I = PendingResponses.find(SeqNo); + + if (I != PendingResponses.end()) { + PRHandler = std::move(I->second); + PendingResponses.erase(I); + SequenceNumberMgr.releaseSequenceNumber(SeqNo); + } else { + // Unlock the pending results map to prevent recursive lock. + Lock.unlock(); + abandonPendingResponses(); + return orcError(OrcErrorCode::UnexpectedRPCResponse); + } } - auto PRHandler = std::move(I->second); - PendingResponses.erase(I); - SequenceNumberMgr.releaseSequenceNumber(SeqNo); + assert(PRHandler && + "If we didn't find a response handler we should have bailed out"); if (auto Err = PRHandler->handleResponse(C)) { abandonPendingResponses(); - SequenceNumberMgr.reset(); return Err; } @@ -915,7 +947,7 @@ protected: FunctionIdT handleNegotiate(const std::string &Name) { auto I = LocalFunctionIds.find(Name); if (I == LocalFunctionIds.end()) - return FnIdAllocator.getInvalidId(); + return getInvalidFunctionId(); return I->second; } @@ -938,7 +970,7 @@ protected: // If autonegotiation indicates that the remote end doesn't support this // function, return an unknown function error. - if (RemoteId == FnIdAllocator.getInvalidId()) + if (RemoteId == getInvalidFunctionId()) return orcError(OrcErrorCode::UnknownRPCFunction); // Autonegotiation succeeded and returned a valid id. Update the map and @@ -1012,6 +1044,7 @@ protected: std::map Handlers; + std::mutex ResponsesMutex; detail::SequenceNumberManager SequenceNumberMgr; std::map>> PendingResponses; @@ -1021,17 +1054,18 @@ protected: template -class MultiThreadedRPC - : public detail::RPCBase< - MultiThreadedRPC, ChannelT, - FunctionIdT, SequenceNumberT> { +class MultiThreadedRPCEndpoint + : public detail::RPCEndpointBase< + MultiThreadedRPCEndpoint, + ChannelT, FunctionIdT, SequenceNumberT> { private: using BaseClass = - detail::RPCBase, - ChannelT, FunctionIdT, SequenceNumberT>; + detail::RPCEndpointBase< + MultiThreadedRPCEndpoint, + ChannelT, FunctionIdT, SequenceNumberT>; public: - MultiThreadedRPC(ChannelT &C, bool LazyAutoNegotiation) + MultiThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation) : BaseClass(C, LazyAutoNegotiation) {} /// The LaunchPolicy type allows a launch policy to be specified when adding @@ -1061,30 +1095,41 @@ public: std::move(Launch)); } + /// Add a class-method as a handler. + template + void addHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...), + LaunchPolicy Launch = LaunchPolicy()) { + addHandler( + detail::MemberFnWrapper(Object, Method), + Launch); + } + /// Negotiate a function id for Func with the other end of the channel. - template Error negotiateFunction() { + template Error negotiateFunction(bool Retry = false) { using OrcRPCNegotiate = typename BaseClass::OrcRPCNegotiate; + // Check if we already have a function id... + auto I = this->RemoteFunctionIds.find(Func::getPrototype()); + if (I != this->RemoteFunctionIds.end()) { + // If it's valid there's nothing left to do. 
+ if (I->second != this->getInvalidFunctionId()) + return Error::success(); + // If it's invalid and we can't re-attempt negotiation, throw an error. + if (!Retry) + return orcError(OrcErrorCode::UnknownRPCFunction); + } + + // We don't have a function id for Func yet, call the remote to try to + // negotiate one. if (auto RemoteIdOrErr = callB(Func::getPrototype())) { this->RemoteFunctionIds[Func::getPrototype()] = *RemoteIdOrErr; + if (*RemoteIdOrErr == this->getInvalidFunctionId()) + return orcError(OrcErrorCode::UnknownRPCFunction); return Error::success(); } else return RemoteIdOrErr.takeError(); } - /// Convenience method for negotiating multiple functions at once. - template Error negotiateFunctions() { - return negotiateFunction(); - } - - /// Convenience method for negotiating multiple functions at once. - template - Error negotiateFunctions() { - if (auto Err = negotiateFunction()) - return Err; - return negotiateFunctions(); - } - /// Return type for non-blocking call primitives. template using NonBlockingCallResult = typename detail::ResultTraits< @@ -1169,19 +1214,20 @@ public: template -class SingleThreadedRPC - : public detail::RPCBase< - SingleThreadedRPC, ChannelT, - FunctionIdT, SequenceNumberT> { +class SingleThreadedRPCEndpoint + : public detail::RPCEndpointBase< + SingleThreadedRPCEndpoint, + ChannelT, FunctionIdT, SequenceNumberT> { private: using BaseClass = - detail::RPCBase, - ChannelT, FunctionIdT, SequenceNumberT>; + detail::RPCEndpointBase< + SingleThreadedRPCEndpoint, + ChannelT, FunctionIdT, SequenceNumberT>; using LaunchPolicy = typename BaseClass::LaunchPolicy; public: - SingleThreadedRPC(ChannelT &C, bool LazyAutoNegotiation) + SingleThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation) : BaseClass(C, LazyAutoNegotiation) {} template @@ -1197,29 +1243,31 @@ public: } /// Negotiate a function id for Func with the other end of the channel. - template Error negotiateFunction() { + template Error negotiateFunction(bool Retry = false) { using OrcRPCNegotiate = typename BaseClass::OrcRPCNegotiate; + // Check if we already have a function id... + auto I = this->RemoteFunctionIds.find(Func::getPrototype()); + if (I != this->RemoteFunctionIds.end()) { + // If it's valid there's nothing left to do. + if (I->second != this->getInvalidFunctionId()) + return Error::success(); + // If it's invalid and we can't re-attempt negotiation, throw an error. + if (!Retry) + return orcError(OrcErrorCode::UnknownRPCFunction); + } + + // We don't have a function id for Func yet, call the remote to try to + // negotiate one. if (auto RemoteIdOrErr = callB(Func::getPrototype())) { this->RemoteFunctionIds[Func::getPrototype()] = *RemoteIdOrErr; + if (*RemoteIdOrErr == this->getInvalidFunctionId()) + return orcError(OrcErrorCode::UnknownRPCFunction); return Error::success(); } else return RemoteIdOrErr.takeError(); } - /// Convenience method for negotiating multiple functions at once. - template Error negotiateFunctions() { - return negotiateFunction(); - } - - /// Convenience method for negotiating multiple functions at once. - template - Error negotiateFunctions() { - if (auto Err = negotiateFunction()) - return Err; - return negotiateFunctions(); - } - template typename detail::ResultTraits::ErrorReturnType @@ -1332,6 +1380,68 @@ private: uint32_t NumOutstandingCalls; }; +/// @brief Convenience class for grouping RPC Functions into APIs that can be +/// negotiated as a block. +/// +template +class APICalls { +public: + + /// @brief Test whether this API contains Function F. 
+ template + class Contains { + public: + static const bool value = false; + }; + + /// @brief Negotiate all functions in this API. + template + static Error negotiate(RPCEndpoint &R) { + return Error::success(); + } +}; + +template +class APICalls { +public: + + template + class Contains { + public: + static const bool value = std::is_same::value | + APICalls::template Contains::value; + }; + + template + static Error negotiate(RPCEndpoint &R) { + if (auto Err = R.template negotiateFunction()) + return Err; + return APICalls::negotiate(R); + } + +}; + +template +class APICalls, Funcs...> { +public: + + template + class Contains { + public: + static const bool value = + APICalls::template Contains::value | + APICalls::template Contains::value; + }; + + template + static Error negotiate(RPCEndpoint &R) { + if (auto Err = APICalls::negotiate(R)) + return Err; + return APICalls::negotiate(R); + } + +}; + } // end namespace rpc } // end namespace orc } // end namespace llvm diff --git a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h index 83a7b9a844f2..3b6c84eb1965 100644 --- a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h +++ b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h @@ -48,9 +48,7 @@ public: template Error startSendMessage(const FunctionIdT &FnId, const SequenceIdT &SeqNo) { writeLock.lock(); - if (auto Err = serializeSeq(*this, FnId, SeqNo)) - return Err; - return Error::success(); + return serializeSeq(*this, FnId, SeqNo); } /// Notify the channel that we're ending a message send. diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index a8c8ff9ef2eb..aeb66633f2c8 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -28,14 +28,14 @@ template <> struct ScalarEnumerationTraits { template <> struct MappingTraits { static void mapping(IO &io, TypeTestResolution &res) { - io.mapRequired("Kind", res.TheKind); - io.mapRequired("SizeBitWidth", res.SizeBitWidth); + io.mapOptional("Kind", res.TheKind); + io.mapOptional("SizeBitWidth", res.SizeBitWidth); } }; template <> struct MappingTraits { static void mapping(IO &io, TypeIdSummary& summary) { - io.mapRequired("TTRes", summary.TTRes); + io.mapOptional("TTRes", summary.TTRes); } }; @@ -53,7 +53,7 @@ namespace yaml { template <> struct MappingTraits { static void mapping(IO &io, FunctionSummaryYaml& summary) { - io.mapRequired("TypeTests", summary.TypeTests); + io.mapOptional("TypeTests", summary.TypeTests); } }; @@ -100,8 +100,8 @@ template <> struct CustomMappingTraits { template <> struct MappingTraits { static void mapping(IO &io, ModuleSummaryIndex& index) { - io.mapRequired("GlobalValueMap", index.GlobalValueMap); - io.mapRequired("TypeIdMap", index.TypeIdMap); + io.mapOptional("GlobalValueMap", index.GlobalValueMap); + io.mapOptional("TypeIdMap", index.TypeIdMap); } }; diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index 7a63956f1cdb..2e95f67a14a9 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -879,18 +879,22 @@ extern template class AnalysisManager; /// \brief Convenience typedef for the Function analysis manager. typedef AnalysisManager FunctionAnalysisManager; -/// \brief A module analysis which acts as a proxy for a function analysis -/// manager. +/// \brief An analysis over an "outer" IR unit that provides access to an +/// analysis manager over an "inner" IR unit. 
The inner unit must be contained +/// in the outer unit. /// -/// This primarily proxies invalidation information from the module analysis -/// manager and module pass manager to a function analysis manager. You should -/// never use a function analysis manager from within (transitively) a module -/// pass manager unless your parent module pass has received a proxy result -/// object for it. +/// For example, InnerAnalysisManagerProxy<FunctionAnalysisManager, Module> is +/// an analysis over Modules (the "outer" unit) that provides access to a +/// Function analysis manager. The FunctionAnalysisManager is the "inner" +/// manager being proxied, and Functions are the "inner" unit. The inner/outer +/// relationship is valid because each Function is contained in one Module. /// -/// Note that the proxy's result is a move-only object and represents ownership -/// of the validity of the analyses in the \c FunctionAnalysisManager it -/// provides. +/// If you're (transitively) within a pass manager for an IR unit U that +/// contains IR unit V, you should never use an analysis manager over V, except +/// via one of these proxies. +/// +/// Note that the proxy's result is a move-only RAII object. The validity of +/// the analyses in the inner analysis manager is tied to its lifetime. template class InnerAnalysisManagerProxy : public AnalysisInfoMixin< @@ -926,23 +930,16 @@ public: /// \brief Accessor for the analysis manager. AnalysisManagerT &getManager() { return *InnerAM; } - /// \brief Handler for invalidation of the outer IR unit. - /// - /// If this analysis itself is preserved, then we assume that the set of \c - /// IR units that the inner analysis manager controls hasn't changed and - /// thus we don't need to invalidate *all* cached data associated with any - /// \c IRUnitT* in the \c AnalysisManagerT. + /// \brief Handler for invalidation of the outer IR unit, \c IRUnitT. /// - /// Regardless of whether this analysis is marked as preserved, all of the - /// analyses in the \c AnalysisManagerT are potentially invalidated (for - /// the relevant inner set of their IR units) based on the set of preserved - /// analyses. + /// If the proxy analysis itself is not preserved, we assume that the set of + /// inner IR objects contained in IRUnit may have changed. In this case, + /// we have to call \c clear() on the inner analysis manager, as it may now + /// have stale pointers to its inner IR objects. /// - /// Because this needs to understand the mapping from one IR unit to an - /// inner IR unit, this method isn't defined in the primary template. - /// Instead, each specialization of this template will need to provide an - /// explicit specialization of this method to handle that particular pair - /// of IR unit and inner AnalysisManagerT. + /// Regardless of whether the proxy analysis is marked as preserved, all of + /// the analyses in the inner analysis manager are potentially invalidated + /// based on the set of preserved analyses. bool invalidate( IRUnitT &IR, const PreservedAnalyses &PA, typename AnalysisManager::Invalidator &Inv); @@ -956,13 +953,9 @@ public: /// \brief Run the analysis pass and create our proxy result object. /// - /// This doesn't do any interesting work, it is primarily used to insert our - /// proxy result object into the module analysis cache so that we can proxy - /// invalidation to the function analysis manager.
- /// - /// In debug builds, it will also assert that the analysis manager is empty - /// as no queries should arrive at the function analysis manager prior to - /// this analysis being requested. + /// This doesn't do any interesting work; it is primarily used to insert our + /// proxy result object into the outer analysis cache so that we can proxy + /// invalidation to the inner analysis manager. Result run(IRUnitT &IR, AnalysisManager &AM, ExtraArgTs...) { return Result(*InnerAM); } @@ -996,22 +989,24 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate( extern template class InnerAnalysisManagerProxy; -/// \brief A function analysis which acts as a proxy for a module analysis -/// manager. +/// \brief An analysis over an "inner" IR unit that provides access to an +/// analysis manager over an "outer" IR unit. The inner unit must be contained +/// in the outer unit. /// -/// This primarily provides an accessor to a parent module analysis manager to -/// function passes. Only the const interface of the module analysis manager is -/// provided to indicate that once inside of a function analysis pass you -/// cannot request a module analysis to actually run. Instead, the user must -/// rely on the \c getCachedResult API. +/// For example, OuterAnalysisManagerProxy<ModuleAnalysisManager, Function> is an +/// analysis over Functions (the "inner" unit) which provides access to a Module +/// analysis manager. The ModuleAnalysisManager is the "outer" manager being +/// proxied, and Modules are the "outer" IR unit. The inner/outer relationship +/// is valid because each Function is contained in one Module. /// -/// The invalidation provided by this proxy involves tracking when an -/// invalidation event in the outer analysis manager needs to trigger an -/// invalidation of a particular analysis on this IR unit. +/// This proxy only exposes the const interface of the outer analysis manager, +/// to indicate that you cannot cause an outer analysis to run from within an +/// inner pass. Instead, you must rely on the \c getCachedResult API. /// -/// Because outer analyses aren't invalidated while these IR units are being -/// precessed, we have to register and handle these as deferred invalidation -/// events. +/// This proxy doesn't manage invalidation in any way -- that is handled by the +/// recursive return path of each layer of the pass manager. A consequence of +/// this is that the outer analyses may be stale. We invalidate the outer +/// analyses only when we're done running passes over the inner IR units. template class OuterAnalysisManagerProxy : public AnalysisInfoMixin< @@ -1024,7 +1019,7 @@ public: const AnalysisManagerT &getManager() const { return *AM; } - /// \brief Handle invalidation by ignoring it, this pass is immutable. + /// \brief Handle invalidation by ignoring it; this pass is immutable. bool invalidate( IRUnitT &, const PreservedAnalyses &, typename AnalysisManager::Invalidator &) { @@ -1089,18 +1084,15 @@ AnalysisKey extern template class OuterAnalysisManagerProxy; -/// Provide the \c ModuleAnalysisManager to \c Fucntion proxy. +/// Provide the \c ModuleAnalysisManager to \c Function proxy. typedef OuterAnalysisManagerProxy ModuleAnalysisManagerFunctionProxy; /// \brief Trivial adaptor that maps from a module to its functions. /// /// Designed to allow composition of a FunctionPass(Manager) and -/// a ModulePassManager.
Note that if this pass is constructed with a pointer -/// to a \c ModuleAnalysisManager it will run the -/// \c FunctionAnalysisManagerModuleProxy analysis prior to running the function -/// pass over the module to enable a \c FunctionAnalysisManager to be used -/// within this run safely. +/// a ModulePassManager, by running the FunctionPass(Manager) over every +/// function in the module. /// /// Function passes run within this adaptor can rely on having exclusive access /// to the function they are run over. They should not read or modify any other @@ -1115,6 +1107,10 @@ typedef OuterAnalysisManagerProxy /// module. /// FIXME: Make the above true for all of LLVM's actual passes, some still /// violate this principle. +/// +/// Note that although function passes can access module analyses, module +/// analyses are not invalidated while the function passes are running, so they +/// may be stale. Function analyses will not be stale. template class ModuleToFunctionPassAdaptor : public PassInfoMixin> { @@ -1124,7 +1120,6 @@ public: /// \brief Runs the function pass across every function in the module. PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) { - // Setup the function analysis manager from its proxy. FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); @@ -1145,10 +1140,11 @@ public: PA.intersect(std::move(PassPA)); } - // By definition we preserve the proxy. We also preserve all analyses on - // Function units. This precludes *any* invalidation of function analyses - // by the proxy, but that's OK because we've taken care to invalidate - // analyses in the function analysis manager incrementally above. + // The FunctionAnalysisManagerModuleProxy is preserved because (we assume) + // the function passes we ran didn't add or remove any functions. + // + // We also preserve all analyses on Functions, because we did all the + // invalidation we needed to do above. PA.preserveSet>(); PA.preserve(); return PA; @@ -1166,7 +1162,7 @@ createModuleToFunctionPassAdaptor(FunctionPassT Pass) { return ModuleToFunctionPassAdaptor(std::move(Pass)); } -/// \brief A template utility pass to force an analysis result to be available. +/// \brief A utility pass template to force an analysis result to be available. /// /// If there are extra arguments at the pass's run level there may also be /// extra arguments to the analysis manager's \c getResult routine. We can't @@ -1196,17 +1192,14 @@ struct RequireAnalysisPass } }; -/// \brief A template utility pass to force an analysis result to be -/// invalidated. -/// -/// This is a no-op pass which simply forces a specific analysis result to be -/// invalidated when it is run. +/// \brief A no-op pass template which simply forces a specific analysis result +/// to be invalidated. template struct InvalidateAnalysisPass : PassInfoMixin> { /// \brief Run this pass over some unit of IR. /// - /// This pass can be run over any unit of IR and use any analysis manager + /// This pass can be run over any unit of IR and use any analysis manager, /// provided they satisfy the basic API requirements. When this pass is /// created, these methods can be instantiated to satisfy whatever the /// context requires. @@ -1218,10 +1211,10 @@ struct InvalidateAnalysisPass } }; -/// \brief A utility pass that does nothing but preserves no analyses. +/// \brief A utility pass that does nothing, but preserves no analyses. 
 ///
-/// As a consequence fo not preserving any analyses, this pass will force all
-/// analysis passes to be re-run to produce fresh results if any are needed.
+/// Because this preserves no analyses, any analysis passes queried after this
+/// pass runs will recompute fresh results.
 struct InvalidateAllAnalysesPass : PassInfoMixin<InvalidateAllAnalysesPass> {
   /// \brief Run this pass over some unit of IR.
   template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index e6fe97484580..c907d6b670b5 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -238,6 +238,26 @@ public:
     return make_range(value_op_begin(), value_op_end());
   }
 
+  struct const_value_op_iterator
+      : iterator_adaptor_base<const_value_op_iterator, const_op_iterator,
+                              std::random_access_iterator_tag, const Value *,
+                              ptrdiff_t, const Value *, const Value *> {
+    explicit const_value_op_iterator(const Use *U = nullptr) :
+      iterator_adaptor_base(U) {}
+    const Value *operator*() const { return *I; }
+    const Value *operator->() const { return operator*(); }
+  };
+
+  const_value_op_iterator value_op_begin() const {
+    return const_value_op_iterator(op_begin());
+  }
+  const_value_op_iterator value_op_end() const {
+    return const_value_op_iterator(op_end());
+  }
+  iterator_range<const_value_op_iterator> operand_values() const {
+    return make_range(value_op_begin(), value_op_end());
+  }
+
   /// \brief Drop all references to operands.
   ///
   /// This function is in charge of "letting go" of all objects that this User
diff --git a/include/llvm/Support/Path.h b/include/llvm/Support/Path.h
index 0513350d446b..2bbcef0c293f 100644
--- a/include/llvm/Support/Path.h
+++ b/include/llvm/Support/Path.h
@@ -207,6 +207,14 @@ void native(const Twine &path, SmallVectorImpl<char> &result);
 /// @param path A path that is transformed to native format.
 void native(SmallVectorImpl<char> &path);
 
+/// @brief Replaces backslashes with slashes if Windows.
+///
+/// @param path processed path
+/// @result The result of replacing backslashes with forward slashes if
+/// Windows. On Unix, this function is a no-op because backslashes are valid
+/// path characters.
+std::string convert_to_slash(StringRef path);
+
 /// @}
 /// @name Lexical Observers
 /// @{
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index 4bebc863b4a9..dd55062e56f1 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -215,9 +215,20 @@ ModulePass *createMetaRenamerPass();
 /// manager.
 ModulePass *createBarrierNoopPass();
 
+/// What to do with the summary when running the LowerTypeTests pass.
+enum class LowerTypeTestsSummaryAction {
+  None,   ///< Do nothing.
+  Import, ///< Import typeid resolutions from summary and globals.
+  Export, ///< Export typeid resolutions to summary and globals.
+};
+
 /// \brief This pass lowers type metadata and the llvm.type.test intrinsic to
 /// bitsets.
-ModulePass *createLowerTypeTestsPass();
+/// \param Action What to do with the summary passed as Index.
+/// \param Index The summary to use for importing or exporting; this can be
+/// null when Action is None.
+ModulePass *createLowerTypeTestsPass(LowerTypeTestsSummaryAction Action,
+                                     ModuleSummaryIndex *Index);
 
 /// \brief This pass export CFI checks for use by external modules.
ModulePass *createCrossDSOCFIPass(); diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index 9f9ce467337e..abfb24f0fe50 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -21,7 +21,6 @@ #include namespace llvm { -class ModuleSummaryIndex; class Pass; class TargetLibraryInfoImpl; class TargetMachine; diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index b4686a1ff175..8da2f0981d0c 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1106,6 +1106,16 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) return V; + // udiv %V, C -> 0 if %V < C + if (MaxRecurse) { + if (Constant *C = dyn_cast_or_null(SimplifyICmpInst( + ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) { + if (C->isAllOnesValue()) { + return Constant::getNullValue(Op0->getType()); + } + } + } + return nullptr; } @@ -1247,6 +1257,16 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) return V; + // urem %V, C -> %V if %V < C + if (MaxRecurse) { + if (Constant *C = dyn_cast_or_null(SimplifyICmpInst( + ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) { + if (C->isAllOnesValue()) { + return Op0; + } + } + } + return nullptr; } diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 19c0171740c9..3d85ef6988a9 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -179,9 +179,9 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const { } bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const { - // For each block we check that it doesn't have any uses outside of it's - // innermost loop. This process will transitivelly guarntee that current loop - // and all of the nested loops are in the LCSSA form. + // For each block we check that it doesn't have any uses outside of its + // innermost loop. This process will transitively guarantee that the current + // loop and all of the nested loops are in LCSSA form. return all_of(this->blocks(), [&](const BasicBlock *BB) { return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT); }); diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 2746361ab4b5..e7415e623196 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -344,38 +344,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, if (!InvariantGroupMD) return MemDepResult::getUnknown(); - Value *LoadOperand = LI->getPointerOperand(); + // Take the ptr operand after all casts and geps 0. This way we can search + // cast graph down only. + Value *LoadOperand = LI->getPointerOperand()->stripPointerCasts(); + // It's is not safe to walk the use list of global value, because function // passes aren't allowed to look outside their functions. + // FIXME: this could be fixed by filtering instructions from outside + // of current function. if (isa(LoadOperand)) return MemDepResult::getUnknown(); // Queue to process all pointers that are equivalent to load operand. 
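  // Editorial aside (not part of this patch): the "equivalent" pointers
  // gathered below are addresses related by no-op casts, for example:
  //   %b = bitcast %T* %p to i8*                   ; same address as %p
  //   %g = getelementptr %T, %T* %p, i32 0, i32 0  ; also the same address
  // so an !invariant.group load through any of them can answer a query about
  // the others.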
SmallVector LoadOperandsQueue; - SmallSet SeenValues; - auto TryInsertToQueue = [&](Value *V) { - if (SeenValues.insert(V).second) - LoadOperandsQueue.push_back(V); - }; - - TryInsertToQueue(LoadOperand); + LoadOperandsQueue.push_back(LoadOperand); while (!LoadOperandsQueue.empty()) { const Value *Ptr = LoadOperandsQueue.pop_back_val(); - assert(Ptr); - if (isa(Ptr)) - continue; - - // Value comes from bitcast: Ptr = bitcast x. Insert x. - if (auto *BCI = dyn_cast(Ptr)) - TryInsertToQueue(BCI->getOperand(0)); - // Gep with zeros is equivalent to bitcast. - // FIXME: we are not sure if some bitcast should be canonicalized to gep 0 - // or gep 0 to bitcast because of SROA, so there are 2 forms. When typeless - // pointers will be upstream then both cases will be gone (and this BFS - // also won't be needed). - if (auto *GEP = dyn_cast(Ptr)) - if (GEP->hasAllZeroIndices()) - TryInsertToQueue(GEP->getOperand(0)); + assert(Ptr && !isa(Ptr) && + "Null or GlobalValue should not be inserted"); for (const Use &Us : Ptr->uses()) { auto *U = dyn_cast(Us.getUser()); @@ -385,13 +371,17 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // Bitcast or gep with zeros are using Ptr. Add to queue to check it's // users. U = bitcast Ptr if (isa(U)) { - TryInsertToQueue(U); + LoadOperandsQueue.push_back(U); continue; } - // U = getelementptr Ptr, 0, 0... + // Gep with zeros is equivalent to bitcast. + // FIXME: we are not sure if some bitcast should be canonicalized to gep 0 + // or gep 0 to bitcast because of SROA, so there are 2 forms. When + // typeless pointers will be ready then both cases will be gone + // (and this BFS also won't be needed). if (auto *GEP = dyn_cast(U)) if (GEP->hasAllZeroIndices()) { - TryInsertToQueue(U); + LoadOperandsQueue.push_back(U); continue; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 5e566bcdaff4..44f1a6dde0d2 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -10012,6 +10012,18 @@ void ScalarEvolution::verify() const { // TODO: Verify more things. } +bool ScalarEvolution::invalidate( + Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv) { + // Invalidate the ScalarEvolution object whenever it isn't preserved or one + // of its dependencies is invalidated. + auto PAC = PA.getChecker(); + return !(PAC.preserved() || PAC.preservedSet>()) || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA); +} + AnalysisKey ScalarEvolutionAnalysis::Key; ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 073b4e6ab26a..d31472c0d33c 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -3257,6 +3257,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, case Intrinsic::dbg_value: return true; + case Intrinsic::bitreverse: case Intrinsic::bswap: case Intrinsic::ctlz: case Intrinsic::ctpop: diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 460d39cc28d8..4a5d18e2db75 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -429,7 +429,7 @@ class MetadataLoader::MetadataLoaderImpl { /// Populate the index above to enable lazily loading of metadata, and load /// the named metadata as well as the transitively referenced global /// Metadata. 
- Expected lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders); + Expected lazyLoadModuleMetadataBlock(); /// On-demand loading of a single metadata. Requires the index above to be /// populated. @@ -516,8 +516,8 @@ Error error(const Twine &Message) { Message, make_error_code(BitcodeError::CorruptedBitcode)); } -Expected MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock( - PlaceholderQueue &Placeholders) { +Expected +MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() { IndexCursor = Stream; SmallVector Record; // Get the abbrevs, and preload record positions to make them lazy-loadable. @@ -701,7 +701,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { // then load individual record as needed, starting with the named metadata. if (ModuleLevel && IsImporting && MetadataList.empty() && !DisableLazyLoading) { - auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders); + auto SuccessOrErr = lazyLoadModuleMetadataBlock(); if (!SuccessOrErr) return SuccessOrErr.takeError(); if (SuccessOrErr.get()) { @@ -1561,7 +1561,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( return error("Invalid record"); SmallVector Record; - PlaceholderQueue Placeholders; while (true) { @@ -1608,10 +1607,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( auto Idx = Record[i + 1]; if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) && - !MetadataList.lookup(Idx)) + !MetadataList.lookup(Idx)) { // Load the attachment if it is in the lazy-loadable range and hasn't // been loaded yet. lazyLoadOneMetadata(Idx, Placeholders); + resolveForwardRefsAndPlaceholders(Placeholders); + } Metadata *Node = MetadataList.getMetadataFwdRef(Idx); if (isa(Node)) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a37f4e1116b4..6b62f11f1240 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1714,7 +1714,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N, EVT CCT = getSetCCResultType(NVT); // Hi part is always the same op - Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH}); + Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH}); // We need to know whether to select Lo part that corresponds to 'winning' // Hi part or if Hi parts are equal. 
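// Editorial worked example (not part of this patch). For a 64-bit smax split
// into 32-bit halves, the expansion above computes:
//   Hi = smax(LHSH, RHSH)
//   Lo = (LHSH == RHSH) ? umax(LHSL, RHSL) : (low half of the hi winner)
// The fix passes a single NVT result type to getNode: a {NVT, NVT} list would
// request a two-result node, and MIN/MAX nodes produce exactly one value.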
@@ -1725,7 +1725,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
   SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
 
   // Recursed Lo part if Hi parts are equal, this uses unsigned version
-  SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL});
+  SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL});
 
   Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
 }
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index bae828a2263c..234b2043a6a1 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -381,7 +381,6 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
        I != E; ++I) {
     if (DCELimit != -1 && (int)NumDead >= DCELimit)
       break;
-    int FirstSS, SecondSS;
     if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS &&
         FirstSS != -1) {
@@ -392,12 +391,18 @@
     }
 
     MachineBasicBlock::iterator NextMI = std::next(I);
-    if (NextMI == MBB->end()) continue;
+    MachineBasicBlock::iterator ProbableLoadMI = I;
 
     unsigned LoadReg = 0;
     unsigned StoreReg = 0;
     if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS))) continue;
+
+    // Skip the debug-value pseudo instructions between the load and the store.
+    while ((NextMI != E) && NextMI->isDebugValue()) {
+      ++NextMI;
+      ++I;
+    }
+    if (NextMI == E) continue;
     if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS))) continue;
     if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
 
@@ -407,7 +412,7 @@
 
     if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
       ++NumDead;
-      toErase.push_back(&*I);
+      toErase.push_back(&*ProbableLoadMI);
     }
 
     toErase.push_back(&*NextMI);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index a5a30fab5b69..8f6b1849169a 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -896,6 +896,48 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
   return ELF::R_MIPS_NONE;
 }
 
+// Sometimes we don't need to create a thunk for a branch. This typically
+// happens when the branch target is located in the same object file. In that
+// case the target is either a weak symbol or a symbol in a different
+// executable section. This function checks whether the branch target is in
+// the same object file and whether the distance between source and target
+// fits the R_AARCH64_CALL26 relocation. If both conditions are met, it emits
+// a direct jump to the target and returns true. Otherwise false is returned
+// and a thunk is created.
+bool RuntimeDyldELF::resolveAArch64ShortBranch(
+    unsigned SectionID, relocation_iterator RelI,
+    const RelocationValueRef &Value) {
+  uint64_t Address;
+  if (Value.SymbolName) {
+    auto Loc = GlobalSymbolTable.find(Value.SymbolName);
+
+    // Don't create a direct branch for external symbols.
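    // Editorial note (not from the patch): an external symbol has no
    // GlobalSymbolTable entry yet, so its final address is unknown here and
    // the isInt<28> displacement test below could not be evaluated; only
    // locally defined targets can take the short-branch path.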
+ if (Loc == GlobalSymbolTable.end()) + return false; + + const auto &SymInfo = Loc->second; + Address = + uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( + SymInfo.getOffset())); + } else { + Address = uint64_t(Sections[Value.SectionID].getLoadAddress()); + } + uint64_t Offset = RelI->getOffset(); + uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset); + + // R_AARCH64_CALL26 requires immediate to be in range -2^27 <= imm < 2^27 + // If distance between source and target is out of range then we should + // create thunk. + if (!isInt<28>(Address + Value.Addend - SourceAddress)) + return false; + + resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(), + Value.Addend); + + return true; +} + Expected RuntimeDyldELF::processRelocationRef( unsigned SectionID, relocation_iterator RelI, const ObjectFile &O, @@ -1003,7 +1045,7 @@ RuntimeDyldELF::processRelocationRef( (uint64_t)Section.getAddressWithOffset(i->second), RelType, 0); DEBUG(dbgs() << " Stub function found\n"); - } else { + } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); Stubs[Value] = Section.getStubOffset(); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 796127ab92bd..d1867d091fe2 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -40,6 +40,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl { void resolveAArch64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend); + bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI, + const RelocationValueRef &Value); + void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 66ffe6db29d6..928f69a17de9 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -196,8 +196,15 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index, }; FunctionImporter Importer(Index, Loader); - if (!Importer.importFunctions(TheModule, ImportList)) + Expected Result = Importer.importFunctions(TheModule, ImportList); + if (!Result) { + handleAllErrors(Result.takeError(), [&](ErrorInfoBase &EIB) { + SMDiagnostic Err = SMDiagnostic(TheModule.getModuleIdentifier(), + SourceMgr::DK_Error, EIB.message()); + Err.print("ThinLTO", errs()); + }); report_fatal_error("importFunctions failed"); + } } static void optimizeModule(Module &TheModule, TargetMachine &TM, diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 40105000c56c..5b018676eba3 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -2823,7 +2823,11 @@ StringRef MachORebaseEntry::typeName() const { } bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const { +#ifdef EXPENSIVE_CHECKS assert(Opcodes == Other.Opcodes && "compare iterators of different files"); +#else + assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files"); +#endif return (Ptr == Other.Ptr) && (RemainingLoopCount == Other.RemainingLoopCount) && (Done == Other.Done); @@ -3073,7 +3077,11 @@ uint32_t MachOBindEntry::flags() const { return Flags; } int MachOBindEntry::ordinal() const { return Ordinal; } bool 
MachOBindEntry::operator==(const MachOBindEntry &Other) const { +#ifdef EXPENSIVE_CHECKS assert(Opcodes == Other.Opcodes && "compare iterators of different files"); +#else + assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files"); +#endif return (Ptr == Other.Ptr) && (RemainingLoopCount == Other.RemainingLoopCount) && (Done == Other.Done); diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp index 202783e7d993..11ace84b9ceb 100644 --- a/lib/Object/ModuleSummaryIndexObjectFile.cpp +++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp @@ -22,6 +22,12 @@ using namespace llvm; using namespace object; +static llvm::cl::opt IgnoreEmptyThinLTOIndexFile( + "ignore-empty-index-file", llvm::cl::ZeroOrMore, + llvm::cl::desc( + "Ignore an empty index file and perform non-ThinLTO compilation"), + llvm::cl::init(false)); + ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile( MemoryBufferRef Object, std::unique_ptr I) : SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) { @@ -97,6 +103,8 @@ llvm::getModuleSummaryIndexForFile(StringRef Path) { if (EC) return errorCodeToError(EC); MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef(); + if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize()) + return nullptr; Expected> ObjOrErr = object::ModuleSummaryIndexObjectFile::create(BufferRef); if (!ObjOrErr) diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 0a989706b436..3889902eea54 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -373,7 +373,7 @@ void Option::removeArgument() { GlobalParser->removeOption(this); } void Option::setArgStr(StringRef S) { if (FullyInitialized) GlobalParser->updateArgStr(this, S); - assert(S[0] != '-' && "Option can't start with '-"); + assert((S.empty() || S[0] != '-') && "Option can't start with '-"); ArgStr = S; } diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 0616d05aff57..4bb035eeccca 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -571,6 +571,16 @@ void native(SmallVectorImpl &Path) { #endif } +std::string convert_to_slash(StringRef path) { +#ifdef LLVM_ON_WIN32 + std::string s = path.str(); + std::replace(s.begin(), s.end(), '\\', '/'); + return s; +#else + return path; +#endif +} + StringRef filename(StringRef path) { return *rbegin(path); } diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp index 5fc17d276377..f79b364dc1f7 100644 --- a/lib/Support/TarWriter.cpp +++ b/lib/Support/TarWriter.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/Path.h" using namespace llvm; @@ -109,27 +110,44 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { pad(OS); } +// In the Ustar header, a path can be split at any '/' to store +// a path into UstarHeader::Name and UstarHeader::Prefix. This +// function splits a given path for that purpose. +static std::pair splitPath(StringRef Path) { + if (Path.size() <= sizeof(UstarHeader::Name)) + return {"", Path}; + size_t Sep = Path.rfind('/', sizeof(UstarHeader::Name) + 1); + if (Sep == StringRef::npos) + return {"", Path}; + return {Path.substr(0, Sep), Path.substr(Sep + 1)}; +} + +// Returns true if a given path can be stored to a Ustar header +// without the PAX extension. 
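// Editorial example (not from the patch), assuming the standard 100-byte
// ustar Name field:
//   splitPath("lib/foo.o")                   -> {"", "lib/foo.o"}   (fits Name)
//   splitPath(<120-char path ".../objs/f.o">) -> {".../objs", "f.o"}
// fitsInUstar() below then only needs to check that the Name half fits.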
+static bool fitsInUstar(StringRef Path) { + StringRef Prefix; + StringRef Name; + std::tie(Prefix, Name) = splitPath(Path); + return Name.size() <= sizeof(UstarHeader::Name); +} + // The PAX header is an extended format, so a PAX header needs // to be followed by a "real" header. static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { + StringRef Prefix; + StringRef Name; + std::tie(Prefix, Name) = splitPath(Path); + UstarHeader Hdr = {}; - memcpy(Hdr.Name, Path.data(), Path.size()); + memcpy(Hdr.Name, Name.data(), Name.size()); memcpy(Hdr.Mode, "0000664", 8); snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); memcpy(Hdr.Magic, "ustar", 6); + memcpy(Hdr.Prefix, Prefix.data(), Prefix.size()); computeChecksum(Hdr); OS << StringRef(reinterpret_cast(&Hdr), sizeof(Hdr)); } -// We want to use '/' as a path separator even on Windows. -// This function canonicalizes a given path. -static std::string canonicalize(std::string S) { -#ifdef LLVM_ON_WIN32 - std::replace(S.begin(), S.end(), '\\', '/'); -#endif - return S; -} - // Creates a TarWriter instance and returns it. Expected> TarWriter::create(StringRef OutputPath, StringRef BaseDir) { @@ -145,8 +163,8 @@ TarWriter::TarWriter(int FD, StringRef BaseDir) // Append a given file to an archive. void TarWriter::append(StringRef Path, StringRef Data) { // Write Path and Data. - std::string S = BaseDir + "/" + canonicalize(Path) + "\0"; - if (S.size() <= sizeof(UstarHeader::Name)) { + std::string S = BaseDir + "/" + sys::path::convert_to_slash(Path) + "\0"; + if (fitsInUstar(S)) { writeUstarHeader(OS, S, Data.size()); } else { writePaxHeader(OS, S); diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ef3b44f7c211..2b4fc5397b18 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -608,6 +608,10 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, if ((C = dyn_cast(Addr))) { Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && + (C = dyn_cast(Addr.getOperand(0)))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast(Addr.getOperand(1)))) { Base = Addr.getOperand(0); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0b0a0e7d083e..730bcdcf7afa 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -172,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a6c31629e7c4..da9d009c542b 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -822,6 +822,7 @@ public: bool isForcedVOP3() const { return ForcedEncodingSize == 64; } bool isForcedDPP() const { return ForcedDPP; } bool isForcedSDWA() const { return ForcedSDWA; } + ArrayRef getMatchedVariants() const; std::unique_ptr parseRegister(); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; @@ -1630,31 +1631,44 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +// What asm variants we should check +ArrayRef AMDGPUAsmParser::getMatchedVariants() const { + if (getForcedEncodingSize() == 32) { + static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT}; + return makeArrayRef(Variants); + } + + if (isForcedVOP3()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3}; + return makeArrayRef(Variants); + } + + if (isForcedSDWA()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + return makeArrayRef(Variants); + } + + if (isForcedDPP()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::DPP}; + return makeArrayRef(Variants); + } + + static const unsigned Variants[] = { + AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + }; + + return makeArrayRef(Variants); +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { - // What asm variants we should check - std::vector MatchedVariants; - if (getForcedEncodingSize() == 32) { - MatchedVariants = {AMDGPUAsmVariants::DEFAULT}; - } else if (isForcedVOP3()) { - MatchedVariants = {AMDGPUAsmVariants::VOP3}; - } else if (isForcedSDWA()) { - MatchedVariants = {AMDGPUAsmVariants::SDWA}; - } else if (isForcedDPP()) { - MatchedVariants = {AMDGPUAsmVariants::DPP}; - } else { - MatchedVariants = {AMDGPUAsmVariants::DEFAULT, - AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, - AMDGPUAsmVariants::DPP}; - } - MCInst Inst; unsigned Result = Match_Success; - for (auto Variant : MatchedVariants) { + for (auto Variant : getMatchedVariants()) { uint64_t EI; auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm, Variant); @@ -3486,7 +3500,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || + if ((BasicInstType == SIInstrFlags::VOPC || BasicInstType == SIInstrFlags::VOP2)&& Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 89c9266746ac..de7ce5cb9e47 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -99,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); + // We need to include these since trunc STORES to PRIVATE need + // special handling to accommodate RMW + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom); + 
setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
+  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
 
   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
@@ -1087,79 +1099,114 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                    SelectionDAG &DAG) const {
   SDLoc DL(Store);
+  // TODO: Who creates the i8 stores?
+  assert(Store->isTruncatingStore()
+         || Store->getValue().getValueType() == MVT::i8);
+  assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
 
-  unsigned Mask = 0;
+  SDValue Mask;
   if (Store->getMemoryVT() == MVT::i8) {
-    Mask = 0xff;
+    assert(Store->getAlignment() >= 1);
+    Mask = DAG.getConstant(0xff, DL, MVT::i32);
   } else if (Store->getMemoryVT() == MVT::i16) {
-    Mask = 0xffff;
+    assert(Store->getAlignment() >= 2);
+    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
+  } else {
+    llvm_unreachable("Unsupported private trunc store");
   }
 
   SDValue Chain = Store->getChain();
   SDValue BasePtr = Store->getBasePtr();
+  SDValue Offset = Store->getOffset();
   EVT MemVT = Store->getMemoryVT();
 
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
-                            DAG.getConstant(2, DL, MVT::i32));
-  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                            Chain, Ptr,
-                            DAG.getTargetConstant(0, DL, MVT::i32));
+  SDValue LoadPtr = BasePtr;
+  if (!Offset.isUndef()) {
+    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+  }
+
+  // Get dword location
+  // TODO: this should be eliminated by the future SHR ptr, 2
+  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+                            DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+  // Load dword
+  // TODO: can we be smarter about machine pointer info?
+  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
 
-  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+  Chain = Dst.getValue(1);
+
+  // Get offset in dword
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                 DAG.getConstant(0x3, DL, MVT::i32));
 
+  // Convert byte offset to bit shift
   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                  DAG.getConstant(3, DL, MVT::i32));
 
+  // TODO: Contrary to the name of the function,
+  // it also handles sub i32 non-truncating stores (like i1)
   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                   Store->getValue());
 
+  // Mask the value to the right type
   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
 
+  // Shift the value in place
   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                      MaskedValue, ShiftAmt);
 
-  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                DAG.getConstant(Mask, DL, MVT::i32),
-                                ShiftAmt);
-  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
-                        DAG.getConstant(0xffffffff, DL, MVT::i32));
+  // Shift the mask in place
+  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
+
+  // Invert the mask.
NOTE: if we had native ROL instructions we could + // use inverted mask + DstMask = DAG.getNOT(DL, DstMask, MVT::i32); + + // Cleanup the target bits Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + // Add the new bits SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); + + // Store dword + // TODO: Can we be smarter about MachinePointerInfo? + return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo()); } SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *StoreNode = cast(Op); unsigned AS = StoreNode->getAddressSpace(); + + SDValue Chain = StoreNode->getChain(); + SDValue Ptr = StoreNode->getBasePtr(); SDValue Value = StoreNode->getValue(); - EVT ValueVT = Value.getValueType(); + + EVT VT = Value.getValueType(); EVT MemVT = StoreNode->getMemoryVT(); - unsigned Align = StoreNode->getAlignment(); + EVT PtrVT = Ptr.getValueType(); + SDLoc DL(Op); + + // Neither LOCAL nor PRIVATE can do vectors at the moment if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && - ValueVT.isVector()) { - return SplitVectorStore(Op, DAG); + VT.isVector()) { + return scalarizeVectorStore(StoreNode, DAG); } - // Private AS needs special fixes - if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) && + unsigned Align = StoreNode->getAlignment(); + if (Align < MemVT.getStoreSize() && !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { return expandUnalignedStore(StoreNode, DAG); } - SDLoc DL(Op); - SDValue Chain = StoreNode->getChain(); - SDValue Ptr = StoreNode->getBasePtr(); + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, + DAG.getConstant(2, DL, PtrVT)); if (AS == AMDGPUAS::GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { - EVT VT = Value.getValueType(); assert(VT.bitsLE(MVT::i32)); SDValue MaskConstant; if (MemVT == MVT::i8) { @@ -1169,15 +1216,19 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(StoreNode->getAlignment() >= 2); MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); } - SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(0x00000003, DL, VT)); + + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr, + DAG.getConstant(0x00000003, DL, PtrVT)); + SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, DL, VT)); + + // Put the mask in correct place + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); + + // Put the mask in correct place SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, - DAG.getConstant(3, DL, VT)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); - SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 // vector instead. 
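  // Editorial worked example (not from the patch): storing i8 0xAB at a byte
  // address with ByteIndex == 2 gives BitShift == 16, so
  //   Mask         = 0xFF << 16          = 0x00FF0000
  //   ShiftedValue = (0xAB & 0xFF) << 16 = 0x00AB0000
  // and MSKOR merges ShiftedValue into the destination dword under Mask.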
SDValue Src[4] = { @@ -1191,12 +1242,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); - } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - ValueVT.bitsGE(MVT::i32)) { + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. - Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, DL, MVT::i32))); + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { llvm_unreachable("Truncated and indexed stores not supported yet"); @@ -1207,49 +1255,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } } + // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); if (MemVT.bitsLT(MVT::i32)) return lowerPrivateTruncStore(StoreNode, DAG); - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (ValueVT.isVector()) { - unsigned NumElemVT = ValueVT.getVectorNumElements(); - EVT ElemVT = ValueVT.getVectorElementType(); - SmallVector Stores(NumElemVT); - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, DL, MVT::i32)); - - Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32)); - } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); - } else { - if (ValueVT == MVT::i8) { - Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); - } - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + // Standard i32+ store, tag it with DWORDADDR to note that the address + // has been shifted + if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); + return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } - return Chain; + // Tagged i32+ stores will be matched by patterns + return SDValue(); } // return (512 + (kc_bank << 12) @@ -1299,51 +1320,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, LoadSDNode *Load = cast(Op); ISD::LoadExtType ExtType = Load->getExtensionType(); EVT MemVT = Load->getMemoryVT(); + assert(Load->getAlignment() >= MemVT.getStoreSize()); - // getBasePtr(); + SDValue Chain = Load->getChain(); + SDValue Offset = Load->getOffset(); - // Get Register holding the target. - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, DL, MVT::i32)); - // Load the Register. 
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), - Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); + SDValue LoadPtr = BasePtr; + if (!Offset.isUndef()) { + LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset); + } + + // Get dword location + // NOTE: this should be eliminated by the future SHR ptr, 2 + SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr, + DAG.getConstant(0xfffffffc, DL, MVT::i32)); + + // Load dword + // TODO: can we be smarter about machine pointer info? + SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); // Get offset within the register. SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); + LoadPtr, DAG.getConstant(0x3, DL, MVT::i32)); // Bit offset of target byte (byteIdx * 8). SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt); // Eliminate the upper bits by setting them to ... EVT MemEltVT = MemVT.getScalarType(); - // ... ones. - if (ExtType == ISD::SEXTLOAD) { + if (ExtType == ISD::SEXTLOAD) { // ... ones. SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); + Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); + } else { // ... or zeros. + Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT); } - // ... or zeros. SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() + Ret, + Read.getValue(1) // This should be our output chain }; return DAG.getMergeValues(Ops, DL); @@ -1365,12 +1385,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); - if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { - SDValue MergedValues[2] = { - scalarizeVectorLoad(LoadNode, DAG), - Chain - }; - return DAG.getMergeValues(MergedValues, DL); + if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + VT.isVector()) { + return scalarizeVectorLoad(LoadNode, DAG); } int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); @@ -1421,8 +1439,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(MergedValues, DL); } - SDValue LoweredLoad; - // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. 
This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and @@ -1447,47 +1463,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (VT.isVector()) { - unsigned NumElemVT = VT.getVectorNumElements(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Loads[4]; - - assert(NumElemVT <= 4); - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, - Chain, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32), - Op.getOperand(2)); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); - LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); - } else { - LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), // Channel - Op.getOperand(2)); + // DWORDADDR ISD marks already shifted address + if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) { + assert(VT == MVT::i32); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32)); + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr); + return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand()); } - - SDValue Ops[2] = { - LoweredLoad, - Chain - }; - - return DAG.getMergeValues(Ops, DL); + return SDValue(); } SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 3a72e0791fd6..19795bdde647 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -1268,6 +1268,17 @@ let Predicates = [isR600] in { defm R600_ : RegisterLoadStore ; +// Hardcode channel to 0 +// NOTE: LSHR is not available here. 
LSHR is per family instruction +def : Pat < + (i32 (load_private ADDRIndirect:$addr) ), + (R600_RegisterLoad FRAMEri:$addr, (i32 0)) +>; +def : Pat < + (store_private i32:$val, ADDRIndirect:$addr), + (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0)) +>; + //===----------------------------------------------------------------------===// // Pseudo instructions diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index c78e97dfd46f..9140fe6cd148 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -99,6 +99,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); @@ -699,7 +711,8 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - unsigned Offset, bool Signed) const { + unsigned Offset, bool Signed, + const ISD::InputArg *Arg) const { const DataLayout &DL = DAG.getDataLayout(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); @@ -713,20 +726,21 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - SDValue Val; + SDValue Val = Load; + if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && + VT.bitsLT(MemVT)) { + unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; + Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); + } + if (MemVT.isFloatingPoint()) - Val = getFPExtOrFPTrunc(DAG, Load, SL, VT); + Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); else if (Signed) - Val = DAG.getSExtOrTrunc(Load, SL, VT); + Val = DAG.getSExtOrTrunc(Val, SL, VT); else - Val = DAG.getZExtOrTrunc(Load, SL, VT); - - SDValue Ops[] = { - Val, - Load.getValue(1) - }; + Val = DAG.getZExtOrTrunc(Val, SL, VT); - return DAG.getMergeValues(Ops, SL); + return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } SDValue SITargetLowering::LowerFormalArguments( @@ -899,7 +913,8 @@ SDValue SITargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. 
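      // Editorial note (not from the patch): passing &Ins[i] below lets
      // LowerParameter (changed above) wrap the parameter load in an
      // AssertZext/AssertSext node recording the original narrow type, e.g.
      //   AssertZext (load i8 ...), ValueType:i1
      // so later DAG combines can drop redundant re-extensions.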
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, - Offset, Ins[i].Flags.isSExt()); + Offset, Ins[i].Flags.isSExt(), + &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 9583f6db6faa..6c04e4f30977 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -24,7 +24,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, unsigned Offset) const; SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, - SDValue Chain, unsigned Offset, bool Signed) const; + SDValue Chain, unsigned Offset, bool Signed, + const ISD::InputArg *Arg = nullptr) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp index 156a21dfecfe..462a7d57d2de 100644 --- a/lib/Target/AVR/AVRISelDAGToDAG.cpp +++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp @@ -203,8 +203,8 @@ unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD, bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode, std::vector &OutOps) { - assert(ConstraintCode == InlineAsm::Constraint_m || - ConstraintCode == InlineAsm::Constraint_Q && + assert((ConstraintCode == InlineAsm::Constraint_m || + ConstraintCode == InlineAsm::Constraint_Q) && "Unexpected asm memory constraint"); MachineRegisterInfo &RI = MF->getRegInfo(); diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index 53668f05b59b..07fc3f6890b8 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -14,6 +14,7 @@ #include "AVRISelLowering.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -1933,5 +1934,45 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } +unsigned AVRTargetLowering::getRegisterByName(const char *RegName, + EVT VT, + SelectionDAG &DAG) const { + unsigned Reg; + + if (VT == MVT::i8) { + Reg = StringSwitch(RegName) + .Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2) + .Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5) + .Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8) + .Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11) + .Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14) + .Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17) + .Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20) + .Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23) + .Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26) + .Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29) + .Case("r30", AVR::R30).Case("r31", AVR::R31) + .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30) + .Default(0); + } else { + Reg = StringSwitch(RegName) + .Case("r0", AVR::R1R0).Case("r2", AVR::R3R2) + .Case("r4", AVR::R5R4).Case("r6", AVR::R7R6) + .Case("r8", AVR::R9R8).Case("r10", AVR::R11R10) + .Case("r12", AVR::R13R12).Case("r14", AVR::R15R14) + .Case("r16", AVR::R17R16).Case("r18", AVR::R19R18) + .Case("r20", 
AVR::R21R20).Case("r22", AVR::R23R22) + .Case("r24", AVR::R25R24).Case("r26", AVR::R27R26) + .Case("r28", AVR::R29R28).Case("r30", AVR::R31R30) + .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30) + .Default(0); + } + + if (Reg) + return Reg; + + report_fatal_error("Invalid register name global variable"); +} + } // end of namespace llvm diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h index 17074e1b1eee..a8cdc4e7ae23 100644 --- a/lib/Target/AVR/AVRISelLowering.h +++ b/lib/Target/AVR/AVRISelLowering.h @@ -116,6 +116,9 @@ public: std::vector &Ops, SelectionDAG &DAG) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + private: SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc, SelectionDAG &DAG, SDLoc dl) const; diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp index cbe4466164f9..e38facead922 100644 --- a/lib/Target/BPF/BPFInstrInfo.cpp +++ b/lib/Target/BPF/BPFInstrInfo.cpp @@ -13,15 +13,13 @@ #include "BPF.h" #include "BPFInstrInfo.h" -#include "BPFSubtarget.h" -#include "BPFTargetMachine.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" +#include +#include #define GET_INSTRINFO_CTOR_DTOR #include "BPFGenInstrInfo.inc" @@ -109,11 +107,11 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB, while (std::next(I) != MBB.end()) std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the J if it's equivalent to a fall-through. 
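    // Editorial aside (not from the patch): per the analyzeBranch contract,
    // returning false with TBB == nullptr means the block falls through,
    // which is why an equivalent unconditional jump is simply erased below.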
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); continue; diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index b0037fbc16ac..9beefcdcc1d5 100644 --- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -12,16 +12,15 @@ //===----------------------------------------------------------------------===// #include "BPF.h" -#include "BPFRegisterInfo.h" #include "BPFSubtarget.h" #include "MCTargetDesc/BPFMCTargetDesc.h" - +#include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +#include using namespace llvm; @@ -36,14 +35,15 @@ class BPFDisassembler : public MCDisassembler { public: BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~BPFDisassembler() {} + ~BPFDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; }; -} + +} // end anonymous namespace static MCDisassembler *createBPFDisassembler(const Target &T, const MCSubtargetInfo &STI, diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index a6cd2002c12c..afc321ea2c34 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -8,28 +8,24 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/BPFMCTargetDesc.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include +#include using namespace llvm; namespace { + class BPFAsmBackend : public MCAsmBackend { public: bool IsLittleEndian; BPFAsmBackend(bool IsLittleEndian) : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} - ~BPFAsmBackend() override {} + ~BPFAsmBackend() override = default; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; @@ -53,6 +49,8 @@ public: bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; }; +} // end anonymous namespace + bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { if ((Count % 8) != 0) return false; @@ -66,7 +64,6 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { - if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { @@ -92,7 +89,6 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { return createBPFELFObjectWriter(OS, 0, 
IsLittleEndian); } -} MCAsmBackend *llvm::createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI, diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 3d1c0eb55afa..ebe9abd8ffac 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -10,29 +10,30 @@ #include "MCTargetDesc/BPFMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include using namespace llvm; namespace { + class BPFELFObjectWriter : public MCELFObjectTargetWriter { public: BPFELFObjectWriter(uint8_t OSABI); - - ~BPFELFObjectWriter() override; + ~BPFELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} + +} // end anonymous namespace BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_BPF, /*HasRelocationAddend*/ false) {} -BPFELFObjectWriter::~BPFELFObjectWriter() {} - unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index 47f16512a397..e8c974479828 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -12,24 +12,25 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/BPFMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/raw_ostream.h" +#include +#include + using namespace llvm; #define DEBUG_TYPE "mccodeemitter" namespace { + class BPFMCCodeEmitter : public MCCodeEmitter { - BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete; - void operator=(const BPFMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; bool IsLittleEndian; @@ -38,8 +39,9 @@ public: BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, bool IsLittleEndian) : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {} - - ~BPFMCCodeEmitter() {} + BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete; + void operator=(const BPFMCCodeEmitter &) = delete; + ~BPFMCCodeEmitter() override = default; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. 
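// Editorial aside (not from the patch): declaring the deleted copy
// operations in the public section, as done above, follows the usual LLVM
// idiom; misuse then diagnoses as a call to a deleted function rather than
// as an access violation of a private member.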
@@ -66,7 +68,8 @@ private: void verifyInstructionPredicates(const MCInst &MI, uint64_t AvailableFeatures) const; }; -} + +} // end anonymous namespace MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 55415f97396b..b58409730de0 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -12,14 +12,13 @@ //===----------------------------------------------------------------------===// #include "BPF.h" -#include "BPFMCTargetDesc.h" -#include "BPFMCAsmInfo.h" #include "InstPrinter/BPFInstPrinter.h" +#include "MCTargetDesc/BPFMCTargetDesc.h" +#include "MCTargetDesc/BPFMCAsmInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC @@ -64,7 +63,7 @@ static MCInstPrinter *createBPFMCInstPrinter(const Triple &T, const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) return new BPFInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } extern "C" void LLVMInitializeBPFTargetMC() { diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index 5fb5b0227800..df12e0e88e3b 100644 --- a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -101,7 +101,7 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) { } LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, - const char* Triple, const char* CPU, const char* Features, + const char *Triple, const char *CPU, const char *Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc, LLVMCodeModel CodeModel) { Optional RM; @@ -139,7 +139,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, TargetOptions opt; return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM, - CM, OL)); + CM, OL)); } void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); } diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt index f4d46383e5bb..d9c53ecc8d08 100644 --- a/lib/Target/WebAssembly/CMakeLists.txt +++ b/lib/Target/WebAssembly/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyExplicitLocals.cpp WebAssemblyFastISel.cpp WebAssemblyFixIrreducibleControlFlow.cpp + WebAssemblyFixFunctionBitcasts.cpp WebAssemblyFrameLowering.cpp WebAssemblyISelDAGToDAG.cpp WebAssemblyISelLowering.cpp diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h index 09c35b4825fc..8738263ad847 100644 --- a/lib/Target/WebAssembly/WebAssembly.h +++ b/lib/Target/WebAssembly/WebAssembly.h @@ -28,6 +28,7 @@ class FunctionPass; // LLVM IR passes. ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj); void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &); +ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); // ISel and immediate followup passes. 
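The new WebAssemblyFixFunctionBitcasts pass introduced below rewrites calls made through bitcast function pointers whose type disagrees with the callee. As a minimal, hypothetical IR sketch (the function names are invented for illustration and are not part of this patch), the kind of input the pass targets looks like:

    ; @callee is called through a bitcast with a mismatched signature.
    ; WebAssembly requires call-site and callee signatures to match exactly,
    ; so the pass reroutes the call through a generated "bitcast" wrapper of
    ; type void () that calls @callee with an undef i32 argument.
    define void @callee(i32 %x) {
      ret void
    }

    define void @caller() {
      call void bitcast (void (i32)* @callee to void ()*)()
      ret void
    }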
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
new file mode 100644
index 000000000000..d5474a02ce01
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -0,0 +1,159 @@
+//===-- WebAssemblyFixFunctionBitcasts.cpp - Fix function bitcasts --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix bitcasted functions.
+///
+/// WebAssembly requires caller and callee signatures to match; however, in
+/// LLVM, some amount of slop is vaguely permitted. Detect mismatch by looking
+/// for bitcasts of functions and rewrite them to use wrapper functions
+/// instead.
+///
+/// This doesn't catch all cases, such as when a function's address is taken in
+/// one place and cast in another, but it works for many common cases.
+///
+/// Note that LLVM already optimizes away function bitcasts in common cases by
+/// dropping arguments as needed, so this pass only ends up getting used in
+/// less common cases.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-function-bitcasts"
+
+namespace {
+class FixFunctionBitcasts final : public ModulePass {
+  StringRef getPassName() const override {
+    return "WebAssembly Fix Function Bitcasts";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  bool runOnModule(Module &M) override;
+
+public:
+  static char ID;
+  FixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char FixFunctionBitcasts::ID = 0;
+ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
+  return new FixFunctionBitcasts();
+}
+
+// Recursively descend the def-use lists from V to find non-bitcast users of
+// bitcasts of V.
+static void FindUses(Value *V, Function &F,
+                     SmallVectorImpl<std::pair<Use *, Function *>> &Uses) {
+  for (Use &U : V->uses()) {
+    if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser()))
+      FindUses(BC, F, Uses);
+    else if (U.get()->getType() != F.getType())
+      Uses.push_back(std::make_pair(&U, &F));
+  }
+}
+
+// Create a wrapper function with type Ty that calls F (which may have a
+// different type). Attempt to support common bitcasted function idioms:
+//  - Call with more arguments than needed: arguments are dropped
+//  - Call with fewer arguments than needed: arguments are filled in with undef
+//  - Return value is not needed: drop it
+//  - Return value needed but not present: supply an undef
+//
+// For now, return nullptr without creating a wrapper if the wrapper cannot
+// be generated due to incompatible types.
+static Function *CreateWrapper(Function *F, FunctionType *Ty) {
+  Module *M = F->getParent();
+
+  Function *Wrapper =
+      Function::Create(Ty, Function::PrivateLinkage, "bitcast", M);
+  BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
+
+  // Determine what arguments to pass.
+  SmallVector<Value *, 4> Args;
+  Function::arg_iterator AI = Wrapper->arg_begin();
+  FunctionType::param_iterator PI = F->getFunctionType()->param_begin();
+  FunctionType::param_iterator PE = F->getFunctionType()->param_end();
+  for (; AI != Wrapper->arg_end() && PI != PE; ++AI, ++PI) {
+    if (AI->getType() != *PI) {
+      Wrapper->eraseFromParent();
+      return nullptr;
+    }
+    Args.push_back(&*AI);
+  }
+  for (; PI != PE; ++PI)
+    Args.push_back(UndefValue::get(*PI));
+
+  CallInst *Call = CallInst::Create(F, Args, "", BB);
+
+  // Determine what value to return.
+  if (Ty->getReturnType()->isVoidTy())
+    ReturnInst::Create(M->getContext(), BB);
+  else if (F->getFunctionType()->getReturnType()->isVoidTy())
+    ReturnInst::Create(M->getContext(), UndefValue::get(Ty->getReturnType()),
+                       BB);
+  else if (F->getFunctionType()->getReturnType() == Ty->getReturnType())
+    ReturnInst::Create(M->getContext(), Call, BB);
+  else {
+    Wrapper->eraseFromParent();
+    return nullptr;
+  }
+
+  return Wrapper;
+}
+
+bool FixFunctionBitcasts::runOnModule(Module &M) {
+  SmallVector<std::pair<Use *, Function *>, 0> Uses;
+
+  // Collect all the places that need wrappers.
+  for (Function &F : M)
+    FindUses(&F, F, Uses);
+
+  DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers;
+
+  for (auto &UseFunc : Uses) {
+    Use *U = UseFunc.first;
+    Function *F = UseFunc.second;
+    PointerType *PTy = cast<PointerType>(U->get()->getType());
+    FunctionType *Ty = dyn_cast<FunctionType>(PTy->getElementType());
+
+    // If the function is cast to something like i8* as a "generic pointer"
+    // to be later cast to something else, we can't generate a wrapper for
+    // it. Just ignore such casts for now.
+    if (!Ty)
+      continue;
+
+    auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
+    if (Pair.second)
+      Pair.first->second = CreateWrapper(F, Ty);
+
+    Function *Wrapper = Pair.first->second;
+    if (!Wrapper)
+      continue;
+
+    if (isa<Constant>(U->get()))
+      U->get()->replaceAllUsesWith(Wrapper);
+    else
+      U->set(Wrapper);
+  }
+
+  return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 8a3248ee669e..e872dc219846 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -40,8 +40,8 @@ defm ROTL : BinaryInt;
 defm ROTR : BinaryInt;
 
 let isCommutable = 1 in {
-defm EQ : ComparisonInt;
-defm NE : ComparisonInt;
+defm EQ : ComparisonInt;
+defm NE : ComparisonInt;
 } // isCommutable = 1
 defm LT_S : ComparisonInt;
 defm LT_U : ComparisonInt;
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b61bc0a08143..f5ef35a2ad40 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -163,6 +163,10 @@ void WebAssemblyPassConfig::addIRPasses() {
   // control specifically what gets lowered.
   addPass(createAtomicExpandPass(TM));
 
+  // Fix function bitcasts, as WebAssembly requires caller and callee
+  // signatures to match.
+  addPass(createWebAssemblyFixFunctionBitcasts());
+
   // Optimize "returned" function attributes.
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createWebAssemblyOptimizeReturned());
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7f72ab17f619..db76ddf04c06 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6962,23 +6962,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
 }
 
-/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
-/// node.
-static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
-                             const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+/// Returns true iff \p BV builds a vector with the result equivalent to
+/// the result of an ADDSUB operation.
+/// If true is returned then the operands of the ADDSUB = Opnd0 +- Opnd1
+/// operation are written to the parameters \p Opnd0 and \p Opnd1.
+static bool isAddSub(const BuildVectorSDNode *BV,
+                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                     SDValue &Opnd0, SDValue &Opnd1) {
+  MVT VT = BV->getSimpleValueType(0);
   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
-      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
-    return SDValue();
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+    return false;
 
-  SDLoc DL(BV);
   unsigned NumElts = VT.getVectorNumElements();
   SDValue InVec0 = DAG.getUNDEF(VT);
   SDValue InVec1 = DAG.getUNDEF(VT);
 
-  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
-          VT == MVT::v2f64) && "build_vector with an invalid type found!");
-
   // Odd-numbered elements in the input build vector are obtained from
   // adding two integer/float elements.
   // Even-numbered elements in the input build vector are obtained from
@@ -7000,7 +7001,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
 
     // Early exit if we found an unexpected opcode.
     if (Opcode != ExpectedOpcode)
-      return SDValue();
+      return false;
 
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
@@ -7013,11 +7014,11 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
         Op0.getOperand(1) != Op1.getOperand(1))
-      return SDValue();
+      return false;
 
     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
     if (I0 != i)
-      return SDValue();
+      return false;
 
     // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
@@ -7029,39 +7030,118 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
     if (InVec0.isUndef()) {
       InVec0 = Op0.getOperand(0);
       if (InVec0.getSimpleValueType() != VT)
-        return SDValue();
+        return false;
     }
     if (InVec1.isUndef()) {
       InVec1 = Op1.getOperand(0);
       if (InVec1.getSimpleValueType() != VT)
-        return SDValue();
+        return false;
     }
 
     // Make sure that operands in input to each add/sub node always
     // come from a same pair of vectors.
     if (InVec0 != Op0.getOperand(0)) {
       if (ExpectedOpcode == ISD::FSUB)
-        return SDValue();
+        return false;
 
       // FADD is commutable. Try to commute the operands
       // and then test again.
       std::swap(Op0, Op1);
       if (InVec0 != Op0.getOperand(0))
-        return SDValue();
+        return false;
     }
 
     if (InVec1 != Op1.getOperand(0))
-      return SDValue();
+      return false;
 
     // Update the pair of expected opcodes.
     std::swap(ExpectedOpcode, NextExpectedOpcode);
   }
 
   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
-  if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
-    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+    return false;
 
-  return SDValue();
+  Opnd0 = InVec0;
+  Opnd1 = InVec1;
+  return true;
+}
+
+/// Returns true if it is possible to fold MUL and an idiom that has already
+/// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
+/// If (and only if) true is returned, the operands of FMADDSUB are written to
+/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+///
+/// Prior to calling this function it should be known that there is some
+/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
+/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
+/// before replacement of such SDNode with ADDSUB operation. Thus the number
+/// of \p Opnd0 uses is expected to be equal to 2.
+/// For example, this function may be called for the following IR:
+///    %AB = fmul fast <2 x double> %A, %B
+///    %Sub = fsub fast <2 x double> %AB, %C
+///    %Add = fadd fast <2 x double> %AB, %C
+///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
+///                            <2 x i32> <i32 0, i32 3>
+/// There is a def for %Addsub here, which potentially can be replaced by
+/// X86ISD::ADDSUB operation:
+///    %Addsub = X86ISD::ADDSUB %AB, %C
+/// and such ADDSUB can further be replaced with FMADDSUB:
+///    %Addsub = FMADDSUB %A, %B, %C.
+///
+/// The main reason why this method is called before the replacement of the
+/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
+/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
+/// FMADDSUB is.
+static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
+  if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
+      !Subtarget.hasAnyFMA())
+    return false;
+
+  // FIXME: These checks must match the similar ones in
+  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
+  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
+  // or MUL + ADDSUB to FMADDSUB.
+  const TargetOptions &Options = DAG.getTarget().Options;
+  bool AllowFusion =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+  if (!AllowFusion)
+    return false;
+
+  Opnd2 = Opnd1;
+  Opnd1 = Opnd0.getOperand(1);
+  Opnd0 = Opnd0.getOperand(0);
+
+  return true;
+}
+
+/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
+/// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
+static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  SDValue Opnd0, Opnd1;
+  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
+    return SDValue();
+
+  MVT VT = BV->getSimpleValueType(0);
+  SDLoc DL(BV);
+
+  // Try to generate X86ISD::FMADDSUB node here.
+  SDValue Opnd2;
+  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+  // the ADDSUB idiom has been successfully recognized. There are no known
+  // X86 targets with 512-bit ADDSUB instructions!
+  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB
+  // idiom recognition.
+  if (VT.is512BitVector())
     return SDValue();
 
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
@@ -7290,7 +7370,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     return VectorConstant;
 
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
-  if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
     return AddSub;
   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
     return HorizontalOp;
@@ -12965,6 +13045,12 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (Subtarget.hasVBMI())
     return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
 
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
 }
@@ -16985,9 +17071,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
   }
 
-  if (Cond.getOpcode() == ISD::SETCC)
-    if (SDValue NewCond = LowerSETCC(Cond, DAG))
+  if (Cond.getOpcode() == ISD::SETCC) {
+    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
       Cond = NewCond;
+      // If the condition was updated, it's possible that the operands of the
+      // select were also updated (for example, EmitTest has a RAUW). Refresh
+      // the local references to the select operands in case they got stale.
+      Op1 = Op.getOperand(1);
+      Op2 = Op.getOperand(2);
+    }
+  }
 
   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
@@ -17193,22 +17286,26 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
     return SDValue();
 
-  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+  if (VT.is512BitVector() && InVTElt != MVT::i1) {
     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   }
 
-  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+  assert (InVTElt == MVT::i1 && "Unexpected vector type");
   MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
-  SDValue NegOne = DAG.getConstant(
-      APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
-  SDValue Zero = DAG.getConstant(
-      APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+  SDValue V;
+  if (Subtarget.hasDQI()) {
+    V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+    assert(!VT.is512BitVector() && "Unexpected vector type");
+  } else {
+    SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
+    V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+    if (VT.is512BitVector())
+      return V;
+  }
 
-  SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
-  if (VT.is512BitVector())
-    return V;
   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
 }
 
@@ -21528,6 +21625,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
   }
 
+  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
+  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
+  // make the existing SSE solution better.
+  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
+      (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
+      (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
+      (Subtarget.hasBWI() && VT == MVT::v32i8)) {
+    MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
+    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
+    unsigned ExtOpc =
+        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+  }
+
   if (VT == MVT::v16i8 ||
       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
@@ -21636,19 +21750,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
-  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
-  // solution better.
-  if (Subtarget.hasInt256() && VT == MVT::v8i16) {
-    MVT ExtVT = MVT::v8i32;
-    unsigned ExtOpc =
-        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
-    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
-    return DAG.getNode(ISD::TRUNCATE, dl, VT,
-                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
-  }
-
   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
     MVT ExtVT = MVT::v8i32;
     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
@@ -27763,29 +27864,32 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// \brief Try to combine a shuffle into a target-specific add-sub node.
+/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
+/// operation. If true is returned then the operands of the ADDSUB operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
 ///
-/// We combine this directly on the abstract vector shuffle nodes so it is
-/// easier to generically match. We also insert dummy vector shuffle nodes for
-/// the operands which explicitly discard the lanes which are unused by this
-/// operation to try to flow through the rest of the combiner the fact that
-/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
-                                      SelectionDAG &DAG) {
-  SDLoc DL(N);
+/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
+/// so it is easier to generically match. We also insert dummy vector shuffle
+/// nodes for the operands which explicitly discard the lanes which are unused
+/// by this operation to try to flow through the rest of the combiner
+/// the fact that they're unused.
+static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
+                     SDValue &Opnd0, SDValue &Opnd1) {
+  EVT VT = N->getValueType(0);
   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
-      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
-    return SDValue();
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+    return false;
 
   // We only handle target-independent shuffles.
   // FIXME: It would be easy and harmless to use the target shuffle mask
   // extraction tool to support more.
   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
-    return SDValue();
+    return false;
 
   ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
-  SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
+  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
 
   SDValue V1 = N->getOperand(0);
   SDValue V2 = N->getOperand(1);
@@ -27796,27 +27900,57 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
     ShuffleVectorSDNode::commuteMask(Mask);
     std::swap(V1, V2);
   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
-    return SDValue();
+    return false;
 
   // If there are other uses of these operations we can't fold them.
   if (!V1->hasOneUse() || !V2->hasOneUse())
-    return SDValue();
+    return false;
 
   // Ensure that both operations have the same operands. Note that we can
   // commute the FADD operands.
   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
-    return SDValue();
+    return false;
 
   // We're looking for blends between FADD and FSUB nodes. We insist on these
   // nodes being lined up in a specific expected pattern.
   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
-        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
+                                           8, 25, 10, 27, 12, 29, 14, 31})))
+    return false;
+
+  Opnd0 = LHS;
+  Opnd1 = RHS;
+  return true;
+}
+
+/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// mul-add-sub node.
+static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
+                                                const X86Subtarget &Subtarget,
+                                                SelectionDAG &DAG) {
+  SDValue Opnd0, Opnd1;
+  if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Try to generate X86ISD::FMADDSUB node here.
+  SDValue Opnd2;
+  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+  // the ADDSUB idiom has been successfully recognized. There are no known
+  // X86 targets with 512-bit ADDSUB instructions!
+  if (VT.is512BitVector())
     return SDValue();
 
-  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
 // We are looking for a shuffle where both sources are concatenated with undef
@@ -27878,7 +28012,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
   if (TLI.isTypeLegal(VT))
-    if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
       return AddSub;
 
   // During Type Legalization, when promoting illegal vector types,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 908053e1342d..d44d1395f243 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -443,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
                [(set VR512:$dst, (v16i32 immAllOnesV))]>;
 }
 
+// Alias instructions that allow VPTERNLOG to be used with a mask to create
+// a mix of all ones and all zeros elements. This is done this way to force
+// the same register to be used as input for all three sources.
+let isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
+                                (ins VK16WM:$mask), "",
+                           [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
+                                              (v16i32 immAllOnesV),
+                                              (v16i32 immAllZerosV)))]>;
+def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
+                                (ins VK8WM:$mask), "",
+                [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
+                                           (bc_v8i64 (v16i32 immAllOnesV)),
+                                           (bc_v8i64 (v16i32 immAllZerosV))))]>;
+}
+
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
 def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
@@ -1064,10 +1080,10 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
                     (v8f32 VR256X:$src), 1)>;
 def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
           (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                     (v4f64 VR256X:$src), 1)>;
+                    (v4f64 VR256X:$src), 1)>;
 def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                     (v4i64 VR256X:$src), 1)>;
+                    (v4i64 VR256X:$src), 1)>;
 def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                     (v8i32 VR256X:$src), 1)>;
@@ -1485,8 +1501,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
 // AVX-512 - BLEND using mask
 //
 multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
-  let ExeDomain = _.ExeDomain in {
-  let hasSideEffects = 0 in
+  let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
   def rr : AVX5128I opc, string OpcodeStr, X86VectorVTInfo _> {
             (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
             !strconcat(OpcodeStr,
             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-            [(set _.RC:$dst, (vselect _.KRCWM:$mask,
-                                (_.VT _.RC:$src2),
-                                (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
-  let hasSideEffects = 0 in
+            []>, EVEX_4V, EVEX_K;
   def rrkz : AVX5128I, EVEX_4V, EVEX_KZ;
-  let mayLoad = 1, hasSideEffects = 0 in
+  let mayLoad = 1 in {
   def rm : AVX5128I opc, string OpcodeStr, X86VectorVTInfo _> {
             (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
             !strconcat(OpcodeStr,
             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
-            [(set _.RC:$dst, (vselect _.KRCWM:$mask,
-                              (_.VT (bitconvert (_.LdFrag addr:$src2))),
-                              (_.VT _.RC:$src1)))]>,
-            EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
-  let mayLoad = 1, hasSideEffects = 0 in
+            []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
   def rmkz : AVX5128I, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
   }
+  }
 }
 multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+  let mayLoad = 1, hasSideEffects = 0 in {
   def rmbk : AVX5128I,
-              EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+              []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
-  let mayLoad = 1, hasSideEffects = 0 in
   def rmb : AVX5128I, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
-
+  }
 }
 
 multiclass blendmask_dq<bits<8> opc, string OpcodeStr,
@@ -1582,21 +1588,6 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
 defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
 
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
-                          (v8f32 VR256X:$src2))),
-          (EXTRACT_SUBREG
-           (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
-                    (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
-                    (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
-           sub_ymm)>;
-
-def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
-                          (v8i32 VR256X:$src2))),
-          (EXTRACT_SUBREG
-           (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
-                    (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
-                    (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
-           sub_ymm)>;
-}
 //===----------------------------------------------------------------------===//
 // Compare Instructions
 //===----------------------------------------------------------------------===//
@@ -2735,7 +2726,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                     (ins _.KRCWM:$mask, _.RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
                      "${dst} {${mask}} {z}, $src}"),
-                    [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+                    [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
                                       (_.VT _.RC:$src),
                                       _.ImmAllZerosV)))], _.ExeDomain>,
                     EVEX, EVEX_KZ;
@@ -2972,6 +2963,30 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
                            (v16i32 VR512:$src))),
          (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
 
+// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
+// available. Use a 512-bit operation and extract.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+                          (v8f32 VR256X:$src0))),
+          (EXTRACT_SUBREG
+           (v16f32
+            (VMOVAPSZrrk
+             (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+             (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+             (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+           sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+                          (v8i32 VR256X:$src0))),
+          (EXTRACT_SUBREG
+           (v16i32
+            (VMOVDQA32Zrrk
+             (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+             (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+             (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+           sub_ymm)>;
+}
+
 let Predicates = [HasVLX, NoBWI] in {
   // 128-bit load/store without BWI.
   def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
@@ -3116,13 +3131,13 @@ let Predicates = [HasVLX] in {
            (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
 }
-
-// Move Int Doubleword to Packed Double Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set VR128X:$dst,
+
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                         EVEX;
 def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
@@ -3152,47 +3167,47 @@ def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src
 def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
                      "vmovq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
-                     IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
-                     EVEX_CD8<64, CD8VT1>;
-}
-} // ExeDomain = SSEPackedInt
-
-// Move Int Doubleword to Single Scalar
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set FR32X:$dst, (bitconvert GR32:$src))],
+                     IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+                     EVEX_CD8<64, CD8VT1>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert GR32:$src))],
                       IIC_SSE_MOVDQ>, EVEX;
 def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move doubleword from xmm register to r/m32
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
                                          (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                       EVEX;
 def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128X:$src),
                       "vmovd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (extractelt (v4i32 VR128X:$src),
-                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
-                      EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt
-
-// Move quadword from xmm1 register to r/m64
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
-                      "vmovq\t{$src, $dst|$dst, $src}",
-                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+                      [(store (i32 (extractelt (v4i32 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                      EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
                                                    (iPTR 0)))],
                       IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
                       Requires<[HasAVX512, In64BitMode]>;
@@ -3213,39 +3228,39 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
 
 let hasSideEffects = 0 in
 def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
-                             (ins VR128X:$src),
-                             "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
-                             EVEX, VEX_W;
-} // ExeDomain = SSEPackedInt
-
-// Move Scalar Single to Double Int
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
-                      (ins FR32X:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
+                             (ins VR128X:$src),
+                             "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+                             EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
+
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+                      (ins FR32X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32X:$src))],
                       IIC_SSE_MOVD_ToGP>, EVEX;
 def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, FR32X:$src),
-                      "vmovd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
-                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move Quadword Int to Packed Quadword Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
-                      (ins i64mem:$src),
-                      "vmovq\t{$src, $dst|$dst, $src}",
-                      [(set VR128X:$dst,
-                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
-                      EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
-} // ExeDomain = SSEPackedInt
-
-//===----------------------------------------------------------------------===//
-// AVX-512 MOVSS, MOVSD
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+                      EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_move_scalar
 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
           (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+                   (v2f64 VR128X:$src0)),
+          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+                   (bitconvert (v4i32 immAllZerosV))),
+          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+                   (v2f64 VR128X:$src0)),
+          (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
+                           (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+                   (bitconvert (v4i32 immAllZerosV))),
+          (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask),
+                   (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+                   (v2f64 VR128X:$src0)),
+          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+                   (bitconvert (v4i32 immAllZerosV))),
+          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 579359794fbd..e3484d062bc8 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -543,7 +543,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::MOV8rr,          X86::MOV8rm,          0 },
     { X86::MOVAPDrr,        X86::MOVAPDrm,        TB_ALIGN_16 },
     { X86::MOVAPSrr,        X86::MOVAPSrm,        TB_ALIGN_16 },
-    { X86::MOVDDUPrr,       X86::MOVDDUPrm,       0 },
+    { X86::MOVDDUPrr,       X86::MOVDDUPrm,       TB_NO_REVERSE },
     { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm,     0 },
     { X86::MOVDI2SSrr,      X86::MOVDI2SSrm,      0 },
    { X86::MOVDQArr,        X86::MOVDQArm,        TB_ALIGN_16 },
@@ -661,7 +661,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,    0 },
     { X86::VMOVAPDrr,       X86::VMOVAPDrm,       TB_ALIGN_16 },
     { X86::VMOVAPSrr,       X86::VMOVAPSrm,       TB_ALIGN_16 },
-    { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,      0 },
+    { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,      TB_NO_REVERSE },
     { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,    0 },
     { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,     0 },
     { X86::VMOVDQArr,       X86::VMOVDQArm,       TB_ALIGN_16 },
@@ -6864,6 +6864,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
         .addReg(Reg, RegState::Undef).addImm(0xff);
     return true;
   }
+  case X86::AVX512_512_SEXT_MASK_32:
+  case X86::AVX512_512_SEXT_MASK_64: {
+    unsigned Reg = MIB->getOperand(0).getReg();
+    unsigned MaskReg = MIB->getOperand(1).getReg();
+    unsigned MaskState = getRegState(MIB->getOperand(1));
+    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
+                   X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+    MI.RemoveOperand(1);
+    MIB->setDesc(get(Opc));
+    // VPTERNLOG needs 3 register inputs and an immediate.
+    // 0xff will return 1s for any input.
+    MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
+       .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+    return true;
+  }
   case X86::VMOVAPSZ128rm_NOVLX:
     return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
                            get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4cd6ae563f03..09971d586a41 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6397,7 +6397,7 @@ let Predicates = [HasAVX] in {
   defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround",
                                   int_x86_sse41_round_ss,
                                   int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
-  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
+  defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
 }
 
 let Predicates = [UseAVX] in {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index de4839432b9a..107ed9359376 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -144,6 +144,10 @@ int X86TTIImpl::getArithmeticInstrCost(
   }
 
   static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
+    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
+    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
+ { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence }; @@ -168,6 +172,10 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry AVX2UniformConstCostTable[] = { + { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -184,6 +192,14 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). + { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). + { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). + { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence @@ -207,6 +223,43 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; } + static const CostTblEntry AVX2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. + { ISD::SHL, MVT::v16i16, 1 }, // psllw. + { ISD::SRL, MVT::v16i16, 1 }, // psrlw. + { ISD::SRA, MVT::v16i16, 1 }, // psraw. + }; + + if (ST->hasAVX2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry SSE2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. + { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v2i64, 1 }, // psllq. + + { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + + { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v4i32, 1 }, // psrad. + }; + + if (ST->hasSSE2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + static const CostTblEntry AVX512DQCostTable[] = { { ISD::MUL, MVT::v2i64, 1 }, { ISD::MUL, MVT::v4i64, 1 }, @@ -219,6 +272,10 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry AVX512BWCostTable[] = { + { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. @@ -259,7 +316,7 @@ int X86TTIImpl::getArithmeticInstrCost( if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - static const CostTblEntry AVX2CostTable[] = { + static const CostTblEntry AVX2ShiftCostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -283,11 +340,11 @@ int X86TTIImpl::getArithmeticInstrCost( // is lowered into a vector multiply (vpmullw). 
return LT.first; - if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } - static const CostTblEntry XOPCostTable[] = { + static const CostTblEntry XOPShiftCostTable[] = { // 128bit shifts take 1cy, but right shifts require negation beforehand. { ISD::SHL, MVT::v16i8, 1 }, { ISD::SRL, MVT::v16i8, 2 }, @@ -318,93 +375,20 @@ int X86TTIImpl::getArithmeticInstrCost( // Look for XOP lowering tricks. if (ST->hasXOP()) - if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - static const CostTblEntry AVX2CustomCostTable[] = { - { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. - { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - - { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. - { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - - { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. - { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. - - { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v8i32, 1 }, // pmulld - { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add - - { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ - { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ - }; - - // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - - static const CostTblEntry AVXCustomCostTable[] = { - { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. - - { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ - { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ - - // Vectorizing division is a bad idea. See the SSE2 table for more comments. - { ISD::SDIV, MVT::v32i8, 32*20 }, - { ISD::SDIV, MVT::v16i16, 16*20 }, - { ISD::SDIV, MVT::v8i32, 8*20 }, - { ISD::SDIV, MVT::v4i64, 4*20 }, - { ISD::UDIV, MVT::v32i8, 32*20 }, - { ISD::UDIV, MVT::v16i16, 16*20 }, - { ISD::UDIV, MVT::v8i32, 8*20 }, - { ISD::UDIV, MVT::v4i64, 4*20 }, - }; - - // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - - static const CostTblEntry - SSE2UniformCostTable[] = { + static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i8, 1 }, // psllw. - { ISD::SHL, MVT::v32i8, 2 }, // psllw. - { ISD::SHL, MVT::v8i16, 1 }, // psllw. { ISD::SHL, MVT::v16i16, 2 }, // psllw. 
- { ISD::SHL, MVT::v4i32, 1 }, // pslld { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v2i64, 1 }, // psllq. { ISD::SHL, MVT::v4i64, 2 }, // psllq. - { ISD::SRL, MVT::v16i8, 1 }, // psrlw. - { ISD::SRL, MVT::v32i8, 2 }, // psrlw. - { ISD::SRL, MVT::v8i16, 1 }, // psrlw. { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v4i32, 1 }, // psrld. { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v2i64, 1 }, // psrlq. { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. - { ISD::SRA, MVT::v8i16, 1 }, // psraw. { ISD::SRA, MVT::v16i16, 2 }, // psraw. - { ISD::SRA, MVT::v4i32, 1 }, // psrad. { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. @@ -414,7 +398,7 @@ int X86TTIImpl::getArithmeticInstrCost( ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { if (const auto *Entry = - CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) + CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } @@ -422,24 +406,98 @@ int X86TTIImpl::getArithmeticInstrCost( Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { MVT VT = LT.second; // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). - if ((VT == MVT::v8i16 && ST->hasSSE2()) || - (VT == MVT::v4i32 && ST->hasSSE41())) - return LT.first; - - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if ((VT == MVT::v8i32 || VT == MVT::v16i16) && - (ST->hasAVX() && !ST->hasAVX2())) - ISD = ISD::MUL; - - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. - if (VT == MVT::v4i32 && ST->hasSSE2()) + // into vector multiply. + if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || + ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) ISD = ISD::MUL; } + static const CostTblEntry AVX2CostTable[] = { + { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. + + { ISD::SUB, MVT::v32i8, 1 }, // psubb + { ISD::ADD, MVT::v32i8, 1 }, // paddb + { ISD::SUB, MVT::v16i16, 1 }, // psubw + { ISD::ADD, MVT::v16i16, 1 }, // paddw + { ISD::SUB, MVT::v8i32, 1 }, // psubd + { ISD::ADD, MVT::v8i32, 1 }, // paddd + { ISD::SUB, MVT::v4i64, 1 }, // psubq + { ISD::ADD, MVT::v4i64, 1 }, // paddq + + { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. 
+ { ISD::MUL, MVT::v16i16, 1 }, // pmullw + { ISD::MUL, MVT::v8i32, 1 }, // pmulld + { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ + }; + + // Look for AVX2 lowering tricks for custom cases. + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. + { ISD::MUL, MVT::v16i16, 4 }, + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v32i8, 4 }, + { ISD::ADD, MVT::v32i8, 4 }, + { ISD::SUB, MVT::v16i16, 4 }, + { ISD::ADD, MVT::v16i16, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + + // A v4i64 multiply is custom lowered as two split v2i64 vectors that then + // are lowered as a series of long multiplies(3), shifts(3) and adds(2) + // Because we believe v4i64 to be a legal type, we must also include the + // extract+insert in the cost table. Therefore, the cost here is 18 + // instead of 8. + { ISD::MUL, MVT::v4i64, 18 }, + + { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. + + { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, + }; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry SSE42CostTable[] = { { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ @@ -456,6 +514,8 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. @@ -501,6 +561,7 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. 
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add @@ -516,46 +577,19 @@ int X86TTIImpl::getArithmeticInstrCost( // generally a bad idea. Assume somewhat arbitrarily that we have to be able // to hide "20 cycles" for each lane. { ISD::SDIV, MVT::v16i8, 16*20 }, - { ISD::SDIV, MVT::v8i16, 8*20 }, - { ISD::SDIV, MVT::v4i32, 4*20 }, - { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, { ISD::UDIV, MVT::v16i8, 16*20 }, - { ISD::UDIV, MVT::v8i16, 8*20 }, - { ISD::UDIV, MVT::v4i32, 4*20 }, - { ISD::UDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, }; if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - static const CostTblEntry AVX1CostTable[] = { - // We don't have to scalarize unsupported ops. We can issue two half-sized - // operations and we only need to extract the upper YMM half. - // Two ops + 1 extract + 1 insert = 4. - { ISD::MUL, MVT::v16i16, 4 }, - { ISD::MUL, MVT::v8i32, 4 }, - { ISD::SUB, MVT::v32i8, 4 }, - { ISD::ADD, MVT::v32i8, 4 }, - { ISD::SUB, MVT::v16i16, 4 }, - { ISD::ADD, MVT::v16i16, 4 }, - { ISD::SUB, MVT::v8i32, 4 }, - { ISD::ADD, MVT::v8i32, 4 }, - { ISD::SUB, MVT::v4i64, 4 }, - { ISD::ADD, MVT::v4i64, 4 }, - // A v4i64 multiply is custom lowered as two split v2i64 vectors that then - // are lowered as a series of long multiplies(3), shifts(3) and adds(2) - // Because we believe v4i64 to be a legal type, we must also include the - // extract+insert in the cost table. Therefore, the cost here is 18 - // instead of 8. - { ISD::MUL, MVT::v4i64, 18 }, - }; - - // Look for AVX1 lowering tricks. 
-  if (ST->hasAVX() && !ST->hasAVX2())
-    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
-      return LT.first * Entry->Cost;
-
   static const CostTblEntry SSE1CostTable[] = {
     { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
     { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
@@ -639,8 +673,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
     { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
-    { TTI::SK_Reverse, MVT::v64i8,  6 }, // vextracti64x4 + 2*vperm2i128
-                                         // + 2*pshufb + vinserti64x4
+    { TTI::SK_Reverse, MVT::v64i8,  2 }, // pshufb + vshufi64x2
     { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
     { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index f4742aaf748f..82daf754be0d 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -42,6 +42,8 @@ using namespace llvm;
 using namespace lowertypetests;
 
+using SummaryAction = LowerTypeTestsSummaryAction;
+
 #define DEBUG_TYPE "lowertypetests"
 
 STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -55,9 +57,15 @@ static cl::opt<bool> AvoidReuse(
     cl::desc("Try to avoid reuse of byte array addresses using aliases"),
     cl::Hidden, cl::init(true));
 
-static cl::opt<std::string> ClSummaryAction(
+static cl::opt<SummaryAction> ClSummaryAction(
     "lowertypetests-summary-action",
-    cl::desc("What to do with the summary when running this pass"), cl::Hidden);
+    cl::desc("What to do with the summary when running this pass"),
+    cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
+               clEnumValN(SummaryAction::Import, "import",
+                          "Import typeid resolutions from summary and globals"),
+               clEnumValN(SummaryAction::Export, "export",
+                          "Export typeid resolutions to summary and globals")),
+    cl::Hidden);
 
 static cl::opt<std::string> ClReadSummary(
     "lowertypetests-read-summary",
@@ -226,8 +234,8 @@ public:
 class LowerTypeTestsModule {
   Module &M;
 
-  // This is for testing purposes only.
-  std::unique_ptr<ModuleSummaryIndex> OwnedSummary;
+  SummaryAction Action;
+  ModuleSummaryIndex *Summary;
 
   bool LinkerSubsectionsViaSymbols;
   Triple::ArchType Arch;
@@ -319,21 +327,38 @@ class LowerTypeTestsModule {
   void createJumpTable(Function *F, ArrayRef<Function *> Functions);
 
 public:
-  LowerTypeTestsModule(Module &M);
-  ~LowerTypeTestsModule();
+  LowerTypeTestsModule(Module &M, SummaryAction Action,
+                       ModuleSummaryIndex *Summary);
+
   bool lower();
+
+  // Lower the module using the action and summary passed as command line
+  // arguments. For testing purposes only.
+ static bool runForTesting(Module &M); }; struct LowerTypeTests : public ModulePass { static char ID; - LowerTypeTests() : ModulePass(ID) { + + bool UseCommandLine = false; + + SummaryAction Action; + ModuleSummaryIndex *Summary; + + LowerTypeTests() : ModulePass(ID), UseCommandLine(true) { + initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); + } + + LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary) + : ModulePass(ID), Action(Action), Summary(Summary) { initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; - return LowerTypeTestsModule(M).lower(); + if (UseCommandLine) + return LowerTypeTestsModule::runForTesting(M); + return LowerTypeTestsModule(M, Action, Summary).lower(); } }; @@ -343,7 +368,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false, false) char LowerTypeTests::ID = 0; -ModulePass *llvm::createLowerTypeTestsPass() { return new LowerTypeTests; } +ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action, + ModuleSummaryIndex *Summary) { + return new LowerTypeTests(Action, Summary); +} /// Build a bit set for TypeId using the object layouts in /// GlobalLayout. @@ -1145,22 +1173,12 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( } /// Lower all type tests in this module. -LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { - // Handle the command-line summary arguments. This code is for testing - // purposes only, so we handle errors directly. - if (!ClSummaryAction.empty()) { - OwnedSummary = make_unique(); - if (!ClReadSummary.empty()) { - ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary + - ": "); - auto ReadSummaryFile = - ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); - - yaml::Input In(ReadSummaryFile->getBuffer()); - In >> *OwnedSummary; - ExitOnErr(errorCodeToError(In.error())); - } - } +LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action, + ModuleSummaryIndex *Summary) + : M(M), Action(Action), Summary(Summary) { + // FIXME: Use these fields. + (void)this->Action; + (void)this->Summary; Triple TargetTriple(M.getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); @@ -1169,18 +1187,36 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { ObjectFormat = TargetTriple.getObjectFormat(); } -LowerTypeTestsModule::~LowerTypeTestsModule() { - if (ClSummaryAction.empty() || ClWriteSummary.empty()) - return; +bool LowerTypeTestsModule::runForTesting(Module &M) { + ModuleSummaryIndex Summary; - ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + - ": "); - std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); - ExitOnErr(errorCodeToError(EC)); + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. 
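As an aside for readers unfamiliar with the option machinery that this testing path relies on: the enum-valued cl::opt pattern used earlier for -lowertypetests-summary-action generalizes to any pass-local enum. A minimal self-contained sketch, with invented option and enum names (not part of this patch):

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    enum class SummaryMode { None, Import, Export };

    // Each clEnumValN entry maps an enumerator to a flag spelling and help
    // text; cl::init picks the default when the flag is absent.
    static cl::opt<SummaryMode> ExampleMode(
        "example-summary-mode", cl::desc("What to do with the summary"),
        cl::values(clEnumValN(SummaryMode::None, "none", "Do nothing"),
                   clEnumValN(SummaryMode::Import, "import", "Import resolutions"),
                   clEnumValN(SummaryMode::Export, "export", "Export resolutions")),
        cl::init(SummaryMode::None));

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      return ExampleMode == SummaryMode::Export ? 2 : 0;
    }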
+  if (!ClReadSummary.empty()) {
+    ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+                          ": ");
+    auto ReadSummaryFile =
+        ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+    yaml::Input In(ReadSummaryFile->getBuffer());
+    In >> Summary;
+    ExitOnErr(errorCodeToError(In.error()));
+  }
+
+  bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+
+  if (!ClWriteSummary.empty()) {
+    ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+                          ": ");
+    std::error_code EC;
+    raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+    ExitOnErr(errorCodeToError(EC));
+
+    yaml::Output Out(OS);
+    Out << Summary;
+  }
 
-  yaml::Output Out(OS);
-  Out << *OwnedSummary;
+  return Changed;
 }
 
 bool LowerTypeTestsModule::lower() {
@@ -1313,7 +1349,8 @@ bool LowerTypeTestsModule::lower() {
 
 PreservedAnalyses LowerTypeTestsPass::run(Module &M,
                                           ModuleAnalysisManager &AM) {
-  bool Changed = LowerTypeTestsModule(M).lower();
+  bool Changed =
+      LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
   if (!Changed)
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 293ddf21a68f..d086ee05a64f 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -857,7 +857,8 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
   // Lower type metadata and the type.test intrinsic. This pass supports Clang's
   // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
   // link time if CFI is enabled. The pass does nothing if CFI is disabled.
-  PM.add(createLowerTypeTestsPass());
+  PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None,
+                                  /*Summary=*/nullptr));
 
   if (OptLevel != 0)
     addLateLTOOptimizationPasses(PM);
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 012bfc7b4944..013159cde774 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1903,7 +1903,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
     return foldICmpShlOne(Cmp, Shl, C);
 
   // Check that the shift amount is in range. If not, don't perform undefined
-  // shifts. When the shift is visited it will be simplified.
+  // shifts. When the shift is visited, it will be simplified.
   unsigned TypeBits = C->getBitWidth();
   if (ShiftAmt->uge(TypeBits))
     return nullptr;
@@ -1923,7 +1923,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
     return new ICmpInst(Pred, X, LShrC);
 
   if (Shl->hasOneUse()) {
-    // Otherwise strength reduce the shift into an and.
+    // Otherwise, strength reduce the shift into an and.
     Constant *Mask = ConstantInt::get(Shl->getType(),
         APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
@@ -1951,7 +1951,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
   }
 
   // When the shift is nuw and pred is >u or <=u, comparison only really happens
-  // in the pre-shifted bits. Since InstSimplify canoncalizes <=u into <u,
+  // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u,
   if (Shl->hasNoUnsignedWrap() &&
       (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) {
@@ -1970,9 +1970,9 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
   // Transform (icmp pred iM (shl iM %v, N), C)
   // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N))
   // Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N.
- // This enables us to get rid of the shift in favor of a trunc which can be + // This enables us to get rid of the shift in favor of a trunc that may be // free on the target. It has the additional benefit of comparing to a - // smaller constant, which will be target friendly. + // smaller constant that may be more target-friendly. unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1); if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt && DL.isLegalInteger(TypeBits - Amt)) { diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 1d5528398776..54bdc9e0772b 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1818,6 +1818,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { RegisteredFlag = new GlobalVariable( M, IntptrTy, false, GlobalVariable::CommonLinkage, ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName); + RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility); // Update llvm.compiler.used, adding the new liveness globals. This is // needed so that during LTO these variables stay alive. The alternative diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 6aeb5237ffe3..68faa886060a 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1423,7 +1423,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { if (widenLoopCompare(DU)) return nullptr; - // This user does not evaluate to a recurence after widening, so don't + // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. truncateIVUse(DU, DT, LI); diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 08e7acdaaf72..8fb580183e30 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -415,7 +415,9 @@ public: Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), PH->getTerminator()); Value *Initial = - new LoadInst(InitialPtr, "load_initial", PH->getTerminator()); + new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false, + Cand.Load->getAlignment(), PH->getTerminator()); + PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", &L->getHeader()->front()); PHI->addIncoming(Initial, PH); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 6f7682c96cef..76fe91884c7b 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1382,8 +1382,8 @@ void LoopUnswitch::SimplifyCode(std::vector &Worklist, Loop *L) { Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); - BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); + BI->eraseFromParent(); // Remove Succ from the loop tree. 
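The LoopUnswitch hunk just above is a use-after-free fix rather than a cosmetic swap: eraseFromParent() frees the instruction, so the worklist has to be purged while the pointer is still valid. A contrived stand-alone illustration of the ordering rule, using plain containers instead of the pass's real types:

    #include <algorithm>
    #include <vector>

    struct Inst {}; // stand-in for llvm::Instruction

    static void removeFromWorklist(Inst *I, std::vector<Inst *> &Worklist) {
      Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
                     Worklist.end());
    }

    void eraseInstruction(Inst *I, std::vector<Inst *> &Worklist) {
      // Correct order: drop every worklist reference while I is still valid.
      removeFromWorklist(I, Worklist);
      delete I; // plays the role of BI->eraseFromParent()
      // Doing the delete first would make removeFromWorklist compare a stale
      // pointer value, which sanitizers rightly flag.
    }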
LI->removeBlock(Succ); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 8b8236390bf4..eef7db08cd46 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -79,7 +79,8 @@ STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted"); STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted"); STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified"); STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same"); -STATISTIC(NumGVNMaxIterations, "Maximum Number of iterations it took to converge GVN"); +STATISTIC(NumGVNMaxIterations, + "Maximum Number of iterations it took to converge GVN"); //===----------------------------------------------------------------------===// // GVN Pass @@ -327,7 +328,7 @@ private: // Elimination. struct ValueDFS; void convertDenseToDFSOrdered(CongruenceClass::MemberSet &, - std::vector &); + SmallVectorImpl &); bool eliminateInstructions(Function &); void replaceInstruction(Instruction *, Value *); @@ -336,8 +337,11 @@ private: // New instruction creation. void handleNewInstruction(Instruction *){}; + + // Various instruction touch utilities void markUsersTouched(Value *); void markMemoryUsersTouched(MemoryAccess *); + void markLeaderChangeTouched(CongruenceClass *CC); // Utilities. void cleanupTables(); @@ -390,10 +394,10 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false) PHIExpression *NewGVN::createPHIExpression(Instruction *I) { - BasicBlock *PhiBlock = I->getParent(); + BasicBlock *PHIBlock = I->getParent(); auto *PN = cast(I); - auto *E = new (ExpressionAllocator) - PHIExpression(PN->getNumOperands(), I->getParent()); + auto *E = + new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock); E->allocateOperands(ArgRecycler, ExpressionAllocator); E->setType(I->getType()); @@ -408,10 +412,10 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) { std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), [&](const Use &U) -> Value * { - // Don't try to transform self-defined phis + // Don't try to transform self-defined phis. if (U == PN) return PN; - const BasicBlockEdge BBE(PN->getIncomingBlock(U), PhiBlock); + const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock); return lookupOperandLeader(U, I, BBE); }); return E; @@ -710,6 +714,15 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, return E; } +// Utility function to check whether the congruence class has a member other +// than the given instruction. +bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) { + // Either it has more than one member, in which case it must contain something + // other than us (because it's indexed by value), or if it only has one member + // right now, that member should not be us. + return CC->Members.size() > 1 || CC->Members.count(I) == 0; +} + const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I, const BasicBlock *B) { // Unlike loads, we never try to eliminate stores, so we do not check if they @@ -725,8 +738,12 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I, cast(StoreAccess)->getDefiningAccess()); const Expression *OldStore = createStoreExpression(SI, StoreRHS, B); CongruenceClass *CC = ExpressionToClass.lookup(OldStore); + // Basically, check if the congruence class the store is in is defined by a + // store that isn't us, and has the same value. 
MemorySSA takes care of
+  // ensuring the store has the same memory state as us already.
   if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
-      CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B))
+      CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
+      hasMemberOtherThanUs(CC, I))
     return createStoreExpression(SI, StoreRHS, B);
   }
 
@@ -810,36 +827,50 @@ bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
 const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
                                                        const BasicBlock *B) {
   auto *E = cast<PHIExpression>(createPHIExpression(I));
-  if (E->op_empty()) {
+  // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+
+  // See if all arguments are the same.
+  // We track if any were undef because they need special handling.
+  bool HasUndef = false;
+  auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
+    if (Arg == I)
+      return false;
+    if (isa<UndefValue>(Arg)) {
+      HasUndef = true;
+      return false;
+    }
+    return true;
+  });
+  // If we are left with no operands, it's undef.
+  if (Filtered.begin() == Filtered.end()) {
     DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
                  << "\n");
     E->deallocateOperands(ArgRecycler);
     ExpressionAllocator.Deallocate(E);
     return createConstantExpression(UndefValue::get(I->getType()));
   }
-
-  Value *AllSameValue = E->getOperand(0);
-
-  // See if all arguments are the same, ignoring undef arguments, because we can
-  // choose a value that is the same for them.
-  for (const Value *Arg : E->operands())
-    if (Arg != AllSameValue && !isa<UndefValue>(Arg)) {
-      AllSameValue = nullptr;
-      break;
+  Value *AllSameValue = *(Filtered.begin());
+  ++Filtered.begin();
+  // Can't use std::equal here, sadly, because filter.begin moves.
+  if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
+        return V == AllSameValue;
+      })) {
+    // In LLVM's non-standard representation of phi nodes, it's possible to have
+    // phi nodes with cycles (i.e., dependent on other phis that are .... dependent
+    // on the original phi node), especially in weird CFGs where some arguments
+    // are unreachable, or uninitialized along certain paths. This can cause
+    // infinite loops during evaluation. We work around this by not trying to
+    // really evaluate them independently, but instead using a variable
+    // expression to say if one is equivalent to the other.
+    // We also special case undef, so that if we have an undef, we can't use the
+    // common value unless it dominates the phi block.
+    if (HasUndef) {
+      // Only have to check for instructions.
+      if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+        if (!DT->dominates(AllSameInst, I))
+          return E;
     }
-  if (AllSameValue) {
-    // It's possible to have phi nodes with cycles (IE dependent on
-    // other phis that are .... dependent on the original phi node),
-    // especially in weird CFG's where some arguments are unreachable, or
-    // uninitialized along certain paths.
-    // This can cause infinite loops during evaluation (even if you disable
-    // the recursion below, you will simply ping-pong between congruence
-    // classes). If a phi node symbolically evaluates to another phi node,
-    // just leave it alone. If they are really the same, we will still
- if (isa(AllSameValue)) - return E; NumGVNPhisAllSame++; DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue << "\n"); @@ -1007,12 +1038,22 @@ void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) { } } +// Touch the instructions that need to be updated after a congruence class has a +// leader change, and mark changed values. +void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) { + for (auto M : CC->Members) { + if (auto *I = dyn_cast(M)) + TouchedInstructions.set(InstrDFS[I]); + ChangedValues.insert(M); + } +} + // Perform congruence finding on a given value numbering expression. void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { - ValueToExpression[V] = E; // This is guaranteed to return something, since it will at least find // INITIAL. + CongruenceClass *VClass = ValueToClass[V]; assert(VClass && "Should have found a vclass"); // Dead classes should have been eliminated from the mapping. @@ -1031,14 +1072,17 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { place->second = NewClass; // Constants and variables should always be made the leader. - if (const auto *CE = dyn_cast(E)) + if (const auto *CE = dyn_cast(E)) { NewClass->RepLeader = CE->getConstantValue(); - else if (const auto *VE = dyn_cast(E)) - NewClass->RepLeader = VE->getVariableValue(); - else if (const auto *SE = dyn_cast(E)) - NewClass->RepLeader = SE->getStoreInst()->getValueOperand(); - else + } else if (const auto *SE = dyn_cast(E)) { + StoreInst *SI = SE->getStoreInst(); + NewClass->RepLeader = + lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent()); + } else { NewClass->RepLeader = V; + } + assert(!isa(E) && + "VariableExpression should have been handled already"); EClass = NewClass; DEBUG(dbgs() << "Created new congruence class for " << *V @@ -1077,14 +1121,11 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { ExpressionToClass.erase(VClass->DefiningExpr); } } else if (VClass->RepLeader == V) { - // FIXME: When the leader changes, the value numbering of - // everything may change, so we need to reprocess. + // When the leader changes, the value numbering of + // everything may change due to symbolization changes, so we need to + // reprocess. VClass->RepLeader = *(VClass->Members.begin()); - for (auto M : VClass->Members) { - if (auto *I = dyn_cast(M)) - TouchedInstructions.set(InstrDFS[I]); - ChangedValues.insert(M); - } + markLeaderChangeTouched(VClass); } } @@ -1106,6 +1147,27 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { markMemoryUsersTouched(MA); } } + } else if (StoreInst *SI = dyn_cast(V)) { + // There is, sadly, one complicating thing for stores. Stores do not + // produce values, only consume them. However, in order to make loads and + // stores value number the same, we ignore the value operand of the store. + // But the value operand will still be the leader of our class, and thus, it + // may change. Because the store is a use, the store will get reprocessed, + // but nothing will change about it, and so nothing above will catch it + // (since the class will not change). In order to make sure everything ends + // up okay, we need to recheck the leader of the class. Since stores of + // different values value number differently due to different memorydefs, we + // are guaranteed the leader is always the same between stores in the same + // class. 
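The rewritten performSymbolicPHIEvaluation above leans on make_filter_range and all_of to skip self-references and undefs lazily rather than copying operands. A minimal self-contained illustration of that filtered-range idiom, over plain ints instead of PHI operands:

    #include "llvm/ADT/STLExtras.h"
    #include <vector>

    // Returns true when every nonzero element equals the first nonzero one,
    // mirroring the "all the same, ignoring undefs" walk above.
    bool allNonZeroEqual(const std::vector<int> &Vals) {
      auto Filtered =
          llvm::make_filter_range(Vals, [](int V) { return V != 0; });
      if (Filtered.begin() == Filtered.end())
        return true; // nothing survived the filter
      int First = *Filtered.begin();
      return llvm::all_of(Filtered, [First](int V) { return V == First; });
    }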
+ DEBUG(dbgs() << "Checking store leader\n"); + auto ProperLeader = + lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent()); + if (EClass->RepLeader != ProperLeader) { + DEBUG(dbgs() << "Store leader changed, fixing\n"); + EClass->RepLeader = ProperLeader; + markLeaderChangeTouched(EClass); + markMemoryUsersTouched(MSSA->getMemoryAccess(SI)); + } } } @@ -1708,8 +1770,9 @@ struct NewGVN::ValueDFS { } }; -void NewGVN::convertDenseToDFSOrdered(CongruenceClass::MemberSet &Dense, - std::vector &DFSOrderedSet) { +void NewGVN::convertDenseToDFSOrdered( + CongruenceClass::MemberSet &Dense, + SmallVectorImpl &DFSOrderedSet) { for (auto D : Dense) { // First add the value. BasicBlock *BB = getBlockForValue(D); @@ -1972,21 +2035,25 @@ bool NewGVN::eliminateInstructions(Function &F) { ValueDFSStack EliminationStack; // Convert the members to DFS ordered sets and then merge them. - std::vector DFSOrderedSet; + SmallVector DFSOrderedSet; convertDenseToDFSOrdered(CC->Members, DFSOrderedSet); // Sort the whole thing. - sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); - - for (auto &C : DFSOrderedSet) { - int MemberDFSIn = C.DFSIn; - int MemberDFSOut = C.DFSOut; - Value *Member = C.Val; - Use *MemberUse = C.U; - - // We ignore void things because we can't get a value from them. - if (Member && Member->getType()->isVoidTy()) - continue; + std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); + + for (auto &VD : DFSOrderedSet) { + int MemberDFSIn = VD.DFSIn; + int MemberDFSOut = VD.DFSOut; + Value *Member = VD.Val; + Use *MemberUse = VD.U; + + if (Member) { + // We ignore void things because we can't get a value from them. + // FIXME: We could actually use this to kill dead stores that are + // dominated by equivalent earlier stores. + if (Member->getType()->isVoidTy()) + continue; + } if (EliminationStack.empty()) { DEBUG(dbgs() << "Elimination Stack is empty\n"); @@ -1995,8 +2062,6 @@ bool NewGVN::eliminateInstructions(Function &F) { << EliminationStack.dfs_back().first << "," << EliminationStack.dfs_back().second << ")\n"); } - if (Member && isa(Member)) - assert(isa(CC->RepLeader)); DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << "," << MemberDFSOut << ")\n"); @@ -2037,11 +2102,8 @@ bool NewGVN::eliminateInstructions(Function &F) { continue; Value *Result = EliminationStack.back(); - // Don't replace our existing users with ourselves, and don't replace - // phi node arguments with the result of the same phi node. - // IE tmp = phi(tmp11, undef); tmp11 = foo -> tmp = phi(tmp, undef) - if (MemberUse->get() == Result || - (isa(Result) && MemberUse->getUser() == Result)) + // Don't replace our existing users with ourselves. 
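The convertDenseToDFSOrdered signature change above (std::vector to SmallVectorImpl) follows a common LLVM convention worth spelling out: the callee takes SmallVectorImpl<T>& so each caller can choose its own inline size without templating the callee. A minimal sketch of the convention (illustrative functions, not from the patch):

    #include "llvm/ADT/SmallVector.h"

    // Callee: accepts any SmallVector<unsigned, N> without being a template.
    static void collectSquares(unsigned N, llvm::SmallVectorImpl<unsigned> &Out) {
      for (unsigned I = 0; I != N; ++I)
        Out.push_back(I * I);
    }

    unsigned sumOfSquares(unsigned N) {
      llvm::SmallVector<unsigned, 16> Squares; // caller picks the inline size
      collectSquares(N, Squares);
      unsigned Sum = 0;
      for (unsigned S : Squares)
        Sum += S;
      return Sum;
    }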
+ if (MemberUse->get() == Result) continue; DEBUG(dbgs() << "Found replacement " << *Result << " for " diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 8a6be97d08c7..34be90692481 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -511,9 +511,6 @@ private: void visitSelectInst(SelectInst &I); void visitBinaryOperator(Instruction &I); void visitCmpInst(CmpInst &I); - void visitExtractElementInst(ExtractElementInst &I); - void visitInsertElementInst(InsertElementInst &I); - void visitShuffleVectorInst(ShuffleVectorInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } @@ -970,21 +967,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { markOverdefined(&I); } -void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) { - // TODO : SCCP does not handle vectors properly. - return markOverdefined(&I); -} - -void SCCPSolver::visitInsertElementInst(InsertElementInst &I) { - // TODO : SCCP does not handle vectors properly. - return markOverdefined(&I); -} - -void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) { - // TODO : SCCP does not handle vectors properly. - return markOverdefined(&I); -} - // Handle getelementptr instructions. If all operands are constants then we // can turn this into a getelementptr ConstantExpr. // diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 678d02e05d42..9844190ef84a 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -67,12 +67,15 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( return true; } - // When exporting, consult the index. - auto Summaries = ImportIndex.findGlobalValueSummaryList(SGV->getGUID()); - assert(Summaries != ImportIndex.end() && - "Missing summary for global value when exporting"); - assert(Summaries->second.size() == 1 && "Local has more than one summary"); - auto Linkage = Summaries->second.front()->linkage(); + // When exporting, consult the index. We can have more than one local + // with the same GUID, in the case of same-named locals in different but + // same-named source files that were compiled in their respective directories + // (so the source file name and resulting GUID is the same). Find the one + // in this module. 
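To make the collision described in that comment concrete: the GUID for a local is a hash of an identifier that embeds the source file name but not its directory. The helper below is hypothetical and the exact identifier format is owned by GlobalValue::getGlobalIdentifier, so treat this only as a sketch of the shape of the problem:

    #include "llvm/ADT/Twine.h"
    #include "llvm/IR/GlobalValue.h"

    // Hypothetical helper, shown only to make the collision concrete.
    llvm::GlobalValue::GUID guidForLocal(llvm::StringRef SourceFileName,
                                         llvm::StringRef Name) {
      // "util.c:helper" hashes the same whether it came from dir1/util.c or
      // dir2/util.c, because the directory is not part of the key.
      return llvm::GlobalValue::getGUID(
          (llvm::Twine(SourceFileName) + ":" + Name).str());
    }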
+ auto Summary = ImportIndex.findSummaryInModule( + SGV->getGUID(), SGV->getParent()->getModuleIdentifier()); + assert(Summary && "Missing summary for global value when exporting"); + auto Linkage = Summary->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { assert(!isNonRenamableLocal(*SGV) && "Attempting to promote non-renamable local"); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index c8f030f7eb83..11d54bcf4f89 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1189,19 +1189,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; StringRef Name = Callee->getName(); if (Name == "fabs" && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, false); + return optimizeUnaryDoubleFP(CI, B, false); - Value *Op = CI->getArgOperand(0); - if (Instruction *I = dyn_cast(Op)) { - // Fold fabs(x * x) -> x * x; any squared FP value must already be positive. - if (I->getOpcode() == Instruction::FMul) - if (I->getOperand(0) == I->getOperand(1)) - return Op; - } - return Ret; + return nullptr; } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 31daba2248aa..578c65daf7c0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -783,6 +783,10 @@ protected: // Similarly, we create a new latch condition when setting up the structure // of the new loop, so the old one can become dead. SmallPtrSet DeadInstructions; + + // Holds the end values for each induction variable. We save the end values + // so we can later fix-up the external users of the induction variables. + DenseMap IVEndValues; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -1879,13 +1883,6 @@ public: unsigned selectInterleaveCount(bool OptForSize, unsigned VF, unsigned LoopCost); - /// \return The most profitable unroll factor. - /// This method finds the best unroll-factor based on register pressure and - /// other parameters. VF and LoopCost are the selected vectorization factor - /// and the cost of the selected VF. - unsigned computeInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); - /// \brief A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { @@ -3424,7 +3421,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); - Value *EndValue; + Value *&EndValue = IVEndValues[OrigPhi]; if (OrigPhi == OldInduction) { // We know what the end value is. EndValue = CountRoundDown; @@ -3443,9 +3440,6 @@ void InnerLoopVectorizer::createEmptyLoop() { // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, MiddleBlock); - // Fix up external users of the induction variable. - fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock); - // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); @@ -4116,11 +4110,23 @@ void InnerLoopVectorizer::vectorizeLoop() { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } // end of for each Phi in PHIsToFix. 
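One detail worth pausing on in the LoopVectorize change above: Value *&EndValue = IVEndValues[OrigPhi] binds a reference to the slot that DenseMap::operator[] creates, so the pre-existing assignments to EndValue now populate the map as a side effect. A tiny stand-alone model of the idiom (illustrative types, not the vectorizer's):

    #include "llvm/ADT/DenseMap.h"

    int recordEndValue(llvm::DenseMap<int, int> &EndValues, bool Known) {
      // operator[] default-constructs the slot; End aliases it from here on.
      int &End = EndValues[42];
      if (Known)
        End = 7;  // assignment lands directly in the map
      else
        End = -1;
      // Caveat: such a reference is only safe until the next insertion, since
      // a DenseMap rehash moves storage; the vectorizer consumes it promptly.
      return EndValues.lookup(42);
    }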
- fixLCSSAPHIs(); - - // Make sure DomTree is updated. + // Update the dominator tree. + // + // FIXME: After creating the structure of the new loop, the dominator tree is + // no longer up-to-date, and it remains that way until we update it + // here. An out-of-date dominator tree is problematic for SCEV, + // because SCEVExpander uses it to guide code generation. The + // vectorizer use SCEVExpanders in several places. Instead, we should + // keep the dominator tree up-to-date as we go. updateAnalysis(); + // Fix-up external users of the induction variables. + for (auto &Entry : *Legal->getInductionVars()) + fixupIVUsers(Entry.first, Entry.second, + getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), + IVEndValues[Entry.first], LoopMiddleBlock); + + fixLCSSAPHIs(); predicateInstructions(); // Remove redundant induction instructions. diff --git a/test/Analysis/CostModel/X86/shuffle-reverse.ll b/test/Analysis/CostModel/X86/shuffle-reverse.ll index a1bdda0690aa..627d79857434 100644 --- a/test/Analysis/CostModel/X86/shuffle-reverse.ll +++ b/test/Analysis/CostModel/X86/shuffle-reverse.ll @@ -161,7 +161,7 @@ define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) ; AVX1: cost of 8 {{.*}} %V512 = shufflevector ; AVX2: cost of 4 {{.*}} %V512 = shufflevector ; AVX512F: cost of 4 {{.*}} %V512 = shufflevector - ; AVX512BW: cost of 6 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 2 {{.*}} %V512 = shufflevector %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ret void diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll index 52f176fe4d63..e5fff9b5e4da 100644 --- a/test/Analysis/CostModel/X86/testshiftlshr.ll +++ b/test/Analysis/CostModel/X86/testshiftlshr.ll @@ -498,7 +498,7 @@ entry: define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) { entry: ; SSE2: shift16i8c - ; SSE2: cost of 1 {{.*}} lshr + ; SSE2: cost of 2 {{.*}} lshr ; SSE2-CODEGEN: shift16i8c ; SSE2-CODEGEN: psrlw $3 @@ -513,7 +513,7 @@ entry: define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) { entry: ; SSE2: shift32i8c - ; SSE2: cost of 2 {{.*}} lshr + ; SSE2: cost of 4 {{.*}} lshr ; SSE2-CODEGEN: shift32i8c ; SSE2-CODEGEN: psrlw $3 diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll index e385c5bfeeac..6628b9b87986 100644 --- a/test/Analysis/CostModel/X86/testshiftshl.ll +++ b/test/Analysis/CostModel/X86/testshiftshl.ll @@ -498,7 +498,7 @@ entry: define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) { entry: ; SSE2: shift16i8c - ; SSE2: cost of 1 {{.*}} shl + ; SSE2: cost of 2 {{.*}} shl ; SSE2-CODEGEN: shift16i8c ; SSE2-CODEGEN: psllw $3 @@ -513,7 +513,7 @@ entry: define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) { entry: ; SSE2: shift32i8c - ; SSE2: cost of 2 {{.*}} shl + ; SSE2: cost of 4 {{.*}} shl ; SSE2-CODEGEN: shift32i8c ; SSE2-CODEGEN: psllw $3 diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index 888164df75f5..6756f3ba2802 100644 --- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -120,7 +120,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; 
AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -282,7 +282,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i16> %a, %splat @@ -439,7 +439,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -529,8 +529,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift -; XOPAVX2: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, ret <4 x i32> %shift } @@ -568,7 +567,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i16> %a, ret <8 x i16> %shift } @@ -578,9 +577,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift -; AVX2: Found an estimated cost of 10 for instruction: %shift -; AVX512: Found an estimated cost of 10 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -590,10 +590,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift -; AVX2: Found an estimated cost of 20 for instruction: %shift -; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift -; XOP: Found an estimated cost of 8 for 
instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <32 x i16> %a, ret <32 x i16> %shift } @@ -605,7 +606,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 4 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift } @@ -615,9 +616,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift -; AVX2: Found an estimated cost of 24 for instruction: %shift -; AVX512: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } @@ -627,10 +629,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift ; AVX: Found an estimated cost of 16 for instruction: %shift -; AVX2: Found an estimated cost of 48 for instruction: %shift -; AVX512F: Found an estimated cost of 48 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512F: Found an estimated cost of 8 for instruction: %shift +; AVX512BW: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 16 for instruction: %shift +; XOPAVX2: Found an estimated cost of 8 for instruction: %shift %shift = ashr <64 x i8> %a, ret <64 x i8> %shift } diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index b3382253739f..63e6db194d52 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -123,7 +123,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -287,7 +287,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; 
AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i16> %a, %splat @@ -447,7 +447,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 8 for instruction: %shift %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -501,8 +501,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift -; XOPAVX2: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, ret <2 x i64> %shift } @@ -540,8 +539,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift -; XOPAVX2: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, ret <4 x i32> %shift } @@ -579,7 +577,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i16> %a, ret <8 x i16> %shift } @@ -589,9 +587,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift -; AVX2: Found an estimated cost of 10 for instruction: %shift -; AVX512: Found an estimated cost of 10 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -601,21 +600,22 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift -; AVX2: Found an estimated cost of 20 for instruction: %shift -; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; 
AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i16> %a, ret <32 x i16> %shift } define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': -; SSE2: Found an estimated cost of 1 for instruction: %shift -; SSE41: Found an estimated cost of 1 for instruction: %shift -; AVX: Found an estimated cost of 1 for instruction: %shift -; AVX2: Found an estimated cost of 1 for instruction: %shift -; AVX512: Found an estimated cost of 1 for instruction: %shift +; SSE2: Found an estimated cost of 2 for instruction: %shift +; SSE41: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 2 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -623,25 +623,27 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': -; SSE2: Found an estimated cost of 2 for instruction: %shift -; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift -; AVX2: Found an estimated cost of 11 for instruction: %shift -; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': -; SSE2: Found an estimated cost of 4 for instruction: %shift -; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift -; AVX2: Found an estimated cost of 22 for instruction: %shift -; AVX512F: Found an estimated cost of 22 for instruction: %shift +; SSE2: Found an estimated cost of 8 for instruction: %shift +; SSE41: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = lshr <64 x i8> %a, ret <64 x i8> %shift } diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 804c5a76c319..8c42bd66c707 100644 --- 
a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -57,8 +57,8 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift -; SSE41: Found an estimated cost of 10 for instruction: %shift -; AVX: Found an estimated cost of 10 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift @@ -70,8 +70,8 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 20 for instruction: %shift -; SSE41: Found an estimated cost of 20 for instruction: %shift -; AVX: Found an estimated cost of 20 for instruction: %shift +; SSE41: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift @@ -83,8 +83,8 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 40 for instruction: %shift -; SSE41: Found an estimated cost of 40 for instruction: %shift -; AVX: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift @@ -124,7 +124,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -216,8 +216,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift -; SSE41: Found an estimated cost of 10 for instruction: %shift -; AVX: Found an estimated cost of 10 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift @@ -230,8 +230,8 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 
x i32> %b) { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 20 for instruction: %shift -; SSE41: Found an estimated cost of 20 for instruction: %shift -; AVX: Found an estimated cost of 20 for instruction: %shift +; SSE41: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift @@ -244,8 +244,8 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 40 for instruction: %shift -; SSE41: Found an estimated cost of 40 for instruction: %shift -; AVX: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift @@ -288,7 +288,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX: Found an estimated cost of 56 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer %shift = shl <32 x i16> %a, %splat @@ -449,7 +449,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, @@ -607,7 +607,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift -; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, @@ -616,37 +616,39 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': -; SSE2: Found an estimated cost of 1 for instruction: %shift -; SSE41: Found an estimated cost of 1 for instruction: %shift -; AVX: Found an estimated cost of 1 for instruction: %shift -; AVX2: Found an estimated cost of 1 for instruction: %shift -; AVX512: 
Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 1 for instruction: %shift
+; SSE2: Found an estimated cost of 2 for instruction: %shift
+; SSE41: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = shl <16 x i8> %a, 
   ret <16 x i8> %shift
 }
 
 define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
-; SSE2: Found an estimated cost of 2 for instruction: %shift
-; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
-; AVX2: Found an estimated cost of 11 for instruction: %shift
-; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; SSE2: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
   %shift = shl <32 x i8> %a, 
   ret <32 x i8> %shift
 }
 
 define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
-; SSE2: Found an estimated cost of 4 for instruction: %shift
-; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
-; AVX2: Found an estimated cost of 22 for instruction: %shift
-; AVX512F: Found an estimated cost of 22 for instruction: %shift
+; SSE2: Found an estimated cost of 8 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512F: Found an estimated cost of 4 for instruction: %shift
 ; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
   %shift = shl <64 x i8> %a, 
   ret <64 x i8> %shift
 }
diff --git a/test/Analysis/ScalarEvolution/invalidation.ll b/test/Analysis/ScalarEvolution/invalidation.ll
new file mode 100644
index 000000000000..1fcaddb525e6
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/invalidation.ll
@@ -0,0 +1,70 @@
+; Test that SCEV gets invalidated when one of its dependencies is invalidated.
+;
+; Each of the RUNs checks that the pass manager runs SCEV, then invalidates it
+; due to a dependency being invalidated, and then re-runs it. This will
+; directly fail and indicates a failure that would occur later if we didn't
+; invalidate SCEV in this way.
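For background on what the RUN lines in this new test exercise: under the new pass manager, an analysis result that caches pointers into other analyses is expected to report itself stale when any of those is invalidated. A sketch of the assumed shape of that hook, simplified and not the verbatim SCEV implementation:

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/PassManager.h"

    struct SCEVLikeResult {
      // Report ourselves stale whenever an analysis we hold pointers into
      // went away; the manager then recomputes us on the next request, which
      // is the Running/Invalidating/Running sequence the CHECK lines expect.
      bool invalidate(llvm::Function &F, const llvm::PreservedAnalyses &PA,
                      llvm::FunctionAnalysisManager::Invalidator &Inv) {
        return Inv.invalidate<llvm::AssumptionAnalysis>(F, PA) ||
               Inv.invalidate<llvm::DominatorTreeAnalysis>(F, PA) ||
               Inv.invalidate<llvm::LoopAnalysis>(F, PA);
      }
    };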
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; RUN: opt < %s -passes='require<scalar-evolution>,invalidate<assumptions>,print<scalar-evolution>' \
+; RUN:     -debug-pass-manager -disable-output 2>&1 \
+; RUN:     | FileCheck %s -check-prefixes=CHECK,CHECK-AC-INVALIDATE
+;
+; CHECK-AC-INVALIDATE: Running pass: RequireAnalysisPass
+; CHECK-AC-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-AC-INVALIDATE: Running analysis: AssumptionAnalysis
+; CHECK-AC-INVALIDATE: Running pass: InvalidateAnalysisPass
+; CHECK-AC-INVALIDATE: Invalidating analysis: AssumptionAnalysis
+; CHECK-AC-INVALIDATE: Running pass: ScalarEvolutionPrinterPass
+; CHECK-AC-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-AC-INVALIDATE: Running analysis: AssumptionAnalysis
+
+; RUN: opt < %s -passes='require<scalar-evolution>,invalidate<domtree>,print<scalar-evolution>' \
+; RUN:     -debug-pass-manager -disable-output 2>&1 \
+; RUN:     | FileCheck %s -check-prefixes=CHECK,CHECK-DT-INVALIDATE
+;
+; CHECK-DT-INVALIDATE: Running pass: RequireAnalysisPass
+; CHECK-DT-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-DT-INVALIDATE: Running analysis: DominatorTreeAnalysis
+; CHECK-DT-INVALIDATE: Running pass: InvalidateAnalysisPass
+; CHECK-DT-INVALIDATE: Invalidating analysis: DominatorTreeAnalysis
+; CHECK-DT-INVALIDATE: Running pass: ScalarEvolutionPrinterPass
+; CHECK-DT-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-DT-INVALIDATE: Running analysis: DominatorTreeAnalysis
+
+; RUN: opt < %s -passes='require<scalar-evolution>,invalidate<loops>,print<scalar-evolution>' \
+; RUN:     -debug-pass-manager -disable-output 2>&1 \
+; RUN:     | FileCheck %s -check-prefixes=CHECK,CHECK-LI-INVALIDATE
+;
+; CHECK-LI-INVALIDATE: Running pass: RequireAnalysisPass
+; CHECK-LI-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LI-INVALIDATE: Running analysis: LoopAnalysis
+; CHECK-LI-INVALIDATE: Running pass: InvalidateAnalysisPass
+; CHECK-LI-INVALIDATE: Invalidating analysis: LoopAnalysis
+; CHECK-LI-INVALIDATE: Running pass: ScalarEvolutionPrinterPass
+; CHECK-LI-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LI-INVALIDATE: Running analysis: LoopAnalysis
+
+; This test isn't particularly interesting, it's just enough to make sure we
+; actually do some work inside of SCEV so that if we regress here despite the
+; debug pass printing continuing to match, ASan and other tools can catch it.
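For readers unfamiliar with the textual pipeline syntax in those RUN lines: `require<scalar-evolution>` forces the analysis to run, `invalidate<...>` clears one of its dependencies, and `print<scalar-evolution>` forces a re-query. A hypothetical C++ equivalent of the first pipeline, using the new-PM driver types from llvm/IR/PassManager.h and PassBuilder (error handling elided, `F` assumed to be the Function under test):

    PassBuilder PB;
    FunctionAnalysisManager FAM;
    PB.registerFunctionAnalyses(FAM);  // registers SCEV, domtree, loops, ...

    FunctionPassManager FPM;
    FPM.addPass(RequireAnalysisPass<ScalarEvolutionAnalysis, Function>());
    FPM.addPass(InvalidateAnalysisPass<AssumptionAnalysis>());
    FPM.addPass(ScalarEvolutionPrinterPass(llvm::errs()));
    FPM.run(F, FAM);  // FileCheck then matches the -debug-pass-manager log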
+define void @test(i32 %n) { +; CHECK-LABEL: Classifying expressions for: @test +; CHECK: Loop %loop: backedge-taken count is 14 +; CHECK: Loop %loop: max backedge-taken count is 14 +; CHECK: Loop %loop: Predicated backedge-taken count is 14 + +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.inc, %loop ] + %iv.inc = add nsw i32 %iv, 3 + %becond = icmp ne i32 %iv.inc, 46 + br i1 %becond, label %loop, label %leave + +leave: + ret void +} diff --git a/test/Analysis/ValueTracking/assume.ll b/test/Analysis/ValueTracking/assume.ll index 4bffe8ef7909..fe0ee53eb416 100644 --- a/test/Analysis/ValueTracking/assume.ll +++ b/test/Analysis/ValueTracking/assume.ll @@ -1,14 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s define i32 @assume_add(i32 %a, i32 %b) { ; CHECK-LABEL: @assume_add( - %1 = add i32 %a, %b - %last_two_digits = and i32 %1, 3 - %2 = icmp eq i32 %last_two_digits, 0 - call void @llvm.assume(i1 %2) - %3 = add i32 %1, 3 -; CHECK: %3 = or i32 %1, 3 - ret i32 %3 +; CHECK-NEXT: [[T1:%.*]] = add i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[LAST_TWO_DIGITS:%.*]] = and i32 [[T1]], 3 +; CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[LAST_TWO_DIGITS]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[T2]]) +; CHECK-NEXT: [[T3:%.*]] = or i32 [[T1]], 3 +; CHECK-NEXT: ret i32 [[T3]] +; + %t1 = add i32 %a, %b + %last_two_digits = and i32 %t1, 3 + %t2 = icmp eq i32 %last_two_digits, 0 + call void @llvm.assume(i1 %t2) + %t3 = add i32 %t1, 3 + ret i32 %t3 } declare void @llvm.assume(i1) + diff --git a/test/Bindings/Go/lit.local.cfg b/test/Bindings/Go/lit.local.cfg index d68d867fb308..a587f88f54aa 100644 --- a/test/Bindings/Go/lit.local.cfg +++ b/test/Bindings/Go/lit.local.cfg @@ -6,7 +6,7 @@ import sys if not 'go' in config.root.llvm_bindings: config.unsupported = True -if config.root.include_go_tests != 'ON': +if not config.root.include_go_tests: config.unsupported = True def find_executable(executable, path=None): diff --git a/test/Bindings/OCaml/lit.local.cfg b/test/Bindings/OCaml/lit.local.cfg index 7a83ca142808..fd9e1c50e990 100644 --- a/test/Bindings/OCaml/lit.local.cfg +++ b/test/Bindings/OCaml/lit.local.cfg @@ -3,5 +3,5 @@ config.suffixes = ['.ml'] if not 'ocaml' in config.root.llvm_bindings: config.unsupported = True -if config.root.have_ocaml_ounit not in ('1', 'TRUE'): +if not config.root.have_ocaml_ounit: config.unsupported = True diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 635197bc9ddd..c1667049f80f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,6 +1,14 @@ -if(LLVM_BUILD_EXAMPLES) - set(ENABLE_EXAMPLES 1) -endif() +llvm_canonicalize_cmake_booleans( + LLVM_TOOL_LTO_BUILD + HAVE_OCAMLOPT + HAVE_OCAML_OUNIT + LLVM_INCLUDE_GO_TESTS + LLVM_USE_INTEL_JITEVENTS + HAVE_LIBZ + HAVE_LIBXAR + LLVM_ENABLE_DIA_SDK + LLVM_ENABLE_FFI + BUILD_SHARED_LIBS) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll index 628d285141bc..eb79767e62be 100644 --- a/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -137,8 +137,8 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; v2i16 is naturally 4 byte aligned ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal +; EG: 
BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal +; EG: 16 ; EG: 16 define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in @@ -153,11 +153,11 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; GCN-DAG: s_sext_i32_i16 ; v2i16 is naturally 4 byte aligned +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; TODO: We should also use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal +; TODO: We should use ASHR instead of LSHR + BFE +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { @@ -167,16 +167,23 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ret void } -; FUNC-LABEL: {{^}}constant_constant_zextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32: ; GCN: s_load_dwordx2 ; v3i16 is naturally 8 byte aligned -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, +; EG: CF_END +; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 +; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 ; TODO: This should use DST, but for some there are redundant MOVs -; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG: 16 -define void @constant_constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: 16 +; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: 65535 +; EG-DAG: 65535 +define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -184,19 +191,20 @@ entry: ret void } -; FUNC-LABEL: {{^}}constant_constant_sextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32: ; GCN: s_load_dwordx2 +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, ; v3i16 is naturally 8 byte aligned -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal +; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1 +; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 +; EG-DAG: ASHR 
{{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -204,20 +212,24 @@ entry: ret void } -; FUNC-LABEL: {{^}}constant_constant_zextload_v4i16_to_v4i32: +; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32: ; GCN: s_load_dwordx2 ; GCN-DAG: s_and_b32 ; GCN-DAG: s_lshr_b32 ; v4i16 is naturally 8 byte aligned -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}} +; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: This should use LD, but for some there are redundant MOVs +; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal ; EG-DAG: 16 -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal ; EG-DAG: 16 -define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: 65535 +; EG-DAG: 65535 +define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -230,13 +242,14 @@ define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* % ; GCN-DAG: s_sext_i32_i16 ; v4i16 is naturally 8 byte aligned -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EG: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: This should use LD, but for some there are redundant MOVs +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal ; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 @@ -254,24 +267,27 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; GCN-DAG: s_lshr_b32 ; v8i16 is naturally 16 byte aligned -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, 
{{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: These should use LSHR instead of BFE_UINT +; TODO: This should use DST, but for some there are redundant MOVs +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal +; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 +; EG-DAG: 65535 +; EG-DAG: 65535 +; EG-DAG: 65535 +; EG-DAG: 65535 define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -285,17 +301,19 @@ define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; GCN-DAG: s_sext_i32_i16 ; v8i16 is naturally 16 byte aligned -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}}, +; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT +; TODO: This should use DST, but for some there are redundant MOVs +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 @@ -444,7 +462,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; 
TODO: Why not 15 ? +; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in @@ -468,7 +486,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? +; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index f398dd32e06d..7bd131e6516c 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented @@ -10,7 +10,7 @@ ; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}} ; GCN-HSA: flat_load_ushort -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %ld = load i16, i16 addrspace(1)* %in @@ -22,7 +22,7 @@ entry: ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -34,8 +34,8 @@ entry: ; GCN-NOHSA: buffer_load_dwordx2 v ; GCN-HSA: flat_load_dwordx2 v -; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 +; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in @@ -47,7 +47,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in @@ -59,7 +59,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> 
addrspace(1)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in @@ -74,8 +74,8 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in @@ -90,7 +90,7 @@ entry: ; GCN-HSA: flat_load_ushort ; GCN-HSA: flat_store_dword -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 @@ -105,9 +105,9 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; GCN-HSA: flat_load_sshort ; GCN-HSA: flat_store_dword -; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 -; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; EG: 16 +; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 +; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; EGCM: 16 define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 @@ -119,7 +119,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; GCN-NOHSA: buffer_load_ushort ; GCN-HSA: flat_load_ushort -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> @@ -131,9 +131,9 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; GCN-NOHSA: buffer_load_sshort ; GCN-HSA: flat_load_sshort -; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 -; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; EG: 16 +; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 +; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; EGCM: 16 define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> @@ -145,10 +145,9 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; GCN-NOHSA: buffer_load_dword ; GCN-HSA: flat_load_dword -; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal -; EG: 16 +; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 +; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal +; EGCM: 16 define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> @@ -161,13 +160,14 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; GCN-HSA: flat_load_dword -; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 -; TODO: These should use DST, 
but for some there are redundant MOVs -; TODO: We should also use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal -; EG-DAG: 16 -; EG-DAG: 16 +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EGCM: VTX_READ_32 [[DST:T[0-9].[XYZW]]], [[DST]], 0, #1 +; TODO: This should use ASHR instead of LSHR + BFE +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> @@ -175,16 +175,22 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ret void } -; FUNC-LABEL: {{^}}global_global_zextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, +; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 ; TODO: This should use DST, but for some there are redundant MOVs -; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG: 16 -define void @global_global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EGCM: 16 +; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal +; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal +define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -192,19 +198,23 @@ entry: ret void } -; FUNC-LABEL: {{^}}global_global_sextload_v3i16_to_v3i32: +; FUNC-LABEL: {{^}}global_sextload_v3i16_to_v3i32: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal -; EG-DAG: 16 -; EG-DAG: 16 -define void @global_global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, +; EGCM-DAG: VTX_READ_32 
[[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1 +; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 +define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -212,19 +222,22 @@ entry: ret void } -; FUNC-LABEL: {{^}}global_global_zextload_v4i16_to_v4i32: +; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i32: ; GCN-NOHSA: buffer_load_dwordx2 ; GCN-HSA: flat_load_dwordx2 -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: 16 -; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal -; EG-DAG: 16 -define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*}}, literal +; EGCM-DAG: 16 +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal +; EGCM-DAG: 16 +define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -236,17 +249,19 @@ define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, ; GCN-HSA: flat_load_dwordx2 -; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs +; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1 ; TODO: We should use ASHR instead of LSHR + BFE -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; TODO: This should use DST, but for some there are redundant MOVs +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x 
i16> %load to <4 x i32> @@ -258,16 +273,29 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: CF_END +; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: These should use LSHR instead of BFE_UINT +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal +; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal +; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal +; EGCM-DAG: 65535 +; EGCM-DAG: 65535 +; EGCM-DAG: 65535 +; EGCM-DAG: 65535 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -279,24 +307,29 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1 -; TODO: These should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 -; EG-DAG: 16 +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}} +; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}} +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}}, +; EGCM: CF_END +; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1 +; TODO: These should use ASHR instead of LSHR + BFE_INT +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 
0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 +; EGCM-DAG: 16 define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> @@ -311,8 +344,8 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -322,8 +355,8 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i32: -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> @@ -342,10 +375,10 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -364,10 +397,10 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 define void @global_sextload_v32i16_to_v32i32(<32 x i32> 
addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -394,14 +427,14 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> @@ -411,14 +444,14 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; FUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i32: -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 -; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 +; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> @@ -434,8 +467,8 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: MOV {{.*}}, 0.0 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: MOV {{.*}}, 
0.0 define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 @@ -458,10 +491,10 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? -; EG: 31 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal +; TODO: These could be expanded earlier using ASHR 15 +; EGCM: 31 define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 @@ -471,8 +504,8 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i64: -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: MOV {{.*}}, 0.0 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: MOV {{.*}}, 0.0 define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> @@ -482,10 +515,10 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i64: -; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal -; TODO: Why not 15 ? -; EG: 31 +; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal +; TODO: These could be expanded earlier using ASHR 15 +; EGCM: 31 define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> @@ -503,7 +536,7 @@ define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64: -; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> @@ -513,7 +546,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> @@ -523,7 +556,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> @@ -533,7 +566,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> 
addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> @@ -543,7 +576,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> @@ -553,8 +586,8 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> @@ -564,8 +597,8 @@ define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> @@ -575,10 +608,10 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> @@ -588,10 +621,10 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i64: -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 -; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 +; EGCM-DAG: 
VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 +; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index 5d64a152af3c..13d56535303f 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -1,10 +1,9 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: -; SI: v_min_i32_e32 +; GCN: v_min_i32_e32 ; EG: MIN_INT define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -17,7 +16,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: {{^}}s_test_imin_sle_i32: -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -28,7 +27,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin } ; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32: -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { @@ -39,10 +38,10 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32: -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT ; EG: MIN_INT @@ -56,11 +55,11 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_i8: -; SI: s_load_dword -; SI: s_load_dword -; SI: s_sext_i32_i8 -; SI: s_sext_i32_i8 -; SI: s_min_i32 +; GCN: s_load_dword +; GCN: s_load_dword +; GCN: s_sext_i32_i8 +; GCN: s_sext_i32_i8 +; GCN: s_min_i32 define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b @@ -72,21 +71,26 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { ; extloads with mubuf instructions. 
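Behind the prefix shuffle in this file (SI checks that also hold on VI/tonga become GCN), every block tests the same canonical pattern: a compare whose result feeds a select of the same two operands, which the backend is expected to match to a native min. In C++ terms the IR each function instantiates is nothing more than the following (illustrative only, not part of the patch):

    // icmp sle/slt + select  ->  s_min_i32 / v_min_i32_e32 on GCN, MIN_INT on r600
    int smin(int a, int b) { return a <= b ? a : b; }
    // icmp ult/ule + select  ->  s_min_u32 / v_min_u32_e32 on GCN, MIN_UINT on r600
    unsigned umin(unsigned a, unsigned b) { return a < b ? a : b; }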
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte +; GCN: buffer_load_sbyte ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 -; SI: s_endpgm +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 + +; GCN: s_endpgm ; EG: MIN_INT ; EG: MIN_INT @@ -117,7 +121,7 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, < } ; FUNC-LABEL: @v_test_imin_slt_i32 -; SI: v_min_i32_e32 +; GCN: v_min_i32_e32 ; EG: MIN_INT define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -130,7 +134,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: @s_test_imin_slt_i32 -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -141,8 +145,8 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin } ; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32: -; SI: s_min_i32 -; SI: s_min_i32 +; GCN: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT ; EG: MIN_INT @@ -154,7 +158,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -165,7 +169,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { } ; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { @@ -176,7 +180,7 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { } ; FUNC-LABEL: @v_test_umin_ule_i32 -; SI: v_min_u32_e32 +; GCN: v_min_u32_e32 ; EG: MIN_UINT define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -189,11 +193,11 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; FUNC-LABEL: @v_test_umin_ule_v3i32 -; SI: v_min_u32_e32 -; SI: v_min_u32_e32 -; SI: v_min_u32_e32 +; GCN: v_min_u32_e32 +; GCN: v_min_u32_e32 +; GCN: v_min_u32_e32 ; SI-NOT: v_min_u32_e32 -; SI: s_endpgm +; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT @@ -207,7 +211,7 @@ define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrs ret void } ; FUNC-LABEL: @s_test_umin_ule_i32 -; SI: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -218,7 +222,7 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin } ; FUNC-LABEL: @v_test_umin_ult_i32 -; SI: v_min_u32_e32 +; GCN: v_min_u32_e32 ; EG: MIN_UINT define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -231,9 +235,9 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr } ; 
FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: v_min_u32_e32 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: v_min_u32_e32 ; EG: MIN_UINT define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { @@ -246,7 +250,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i } ; FUNC-LABEL: @s_test_umin_ult_i32 -; SI: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { @@ -258,10 +262,10 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; FUNC-LABEL: @v_test_umin_ult_i32_multi_use ; SI-NOT: v_min -; SI: v_cmp_lt_u32 +; GCN: v_cmp_lt_u32 ; SI-NEXT: v_cndmask_b32 ; SI-NOT: v_min -; SI: s_endpgm +; GCN: s_endpgm ; EG-NOT: MIN_UINT define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { @@ -274,9 +278,27 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ret void } +; FUNC-LABEL: @v_test_umin_ult_i16_multi_use +; GCN-NOT: v_min +; GCN: v_cmp_lt_u32 +; GCN-NEXT: v_cndmask_b32 +; GCN-NOT: v_min +; GCN: s_endpgm + +; EG-NOT: MIN_UINT +define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %a = load i16, i16 addrspace(1)* %aptr, align 2 + %b = load i16, i16 addrspace(1)* %bptr, align 2 + %cmp = icmp ult i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out0, align 2 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + ; FUNC-LABEL: @s_test_umin_ult_v1i32 -; SI: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { @@ -287,14 +309,14 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32: -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 +; GCN: s_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT @@ -312,14 +334,14 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 +; GCN: v_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT @@ -338,11 +360,11 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, < ; Make sure redundant and removed ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI: buffer_store_dword [[VMIN]] +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], 
[[MIN]] +; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { @@ -358,11 +380,11 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1 ; Make sure the redundant sign_extend_inreg is removed. ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI: buffer_store_dword [[VMIN]] +; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { @@ -377,7 +399,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 } ; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_min_i32 +; GCN: s_min_i32 ; EG: MIN_INT define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { @@ -389,7 +411,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin ; 64 bit ; FUNC-LABEL: {{^}}test_umin_ult_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT @@ -401,7 +423,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind } ; FUNC-LABEL: {{^}}test_umin_ule_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT @@ -413,7 +435,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind } ; FUNC-LABEL: {{^}}test_imin_slt_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT @@ -425,7 +447,7 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind } ; FUNC-LABEL: {{^}}test_imin_sle_i64 -; SI: s_endpgm +; GCN: s_endpgm ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT diff --git a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll new file mode 100644 index 000000000000..866a4a9191e2 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=r600 -mcpu=cypress -start-after safe-stack %s -o - | FileCheck %s +; Don't crash + +; CHECK: MAX_UINT +define void @test(i64 addrspace(1)* %out) { +bb: + store i64 2, i64 addrspace(1)* %out + %tmp = load i64, i64 addrspace(1)* %out + br label %jump + +jump: ; preds = %bb + %tmp1 = icmp ugt i64 %tmp, 4 + %umax = select i1 %tmp1, i64 %tmp, i64 4 + store i64 %umax, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll new file mode 100644 index 000000000000..33d27f24e9cf --- /dev/null +++ b/test/CodeGen/AMDGPU/store-private.ll @@ -0,0 +1,743 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}store_i1: +; EG: MOVA_INT +; EG: MOV {{[\*
]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +define void @store_i1(i1 addrspace(0)* %out) { +entry: + store i1 true, i1 addrspace(0)* %out + ret void +} + +; i8 store +; FUNC-LABEL: {{^}}store_i8: +; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 2 +; EG: MOVA_INT * AR.x (MASKED) +; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG-NEXT: 3(4.203895e-45) +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x +; EG-NEXT: 255(3.573311e-43) + +; EG: NOT_INT +; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]] +; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]] +; TODO: Is the reload necessary? +; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]] +; EG: MOV * T(0 + AR.x).X+, [[RES]] + +; SI: buffer_store_byte + +define void @store_i8(i8 addrspace(0)* %out, i8 %in) { +entry: + store i8 %in, i8 addrspace(0)* %out + ret void +} + +; i16 store +; FUNC-LABEL: {{^}}store_i16: +; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 2 +; EG: MOVA_INT * AR.x (MASKED) +; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG-NEXT: 3(4.203895e-45) +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x +; EG-NEXT: 65535(9.183409e-41) + +; EG: NOT_INT +; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]] +; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]] +; TODO: Is the reload necessary? 
+; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]] +; EG: MOV * T(0 + AR.x).X+, [[RES]] + +; SI: buffer_store_short +define void @store_i16(i16 addrspace(0)* %out, i16 %in) { +entry: + store i16 %in, i16 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i24: +; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_short + +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store can be eliminated +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store can be eliminated +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +define void @store_i24(i24 addrspace(0)* %out, i24 %in) { +entry: + store i24 %in, i24 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i25: +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}} +; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]] +; SI: buffer_store_dword [[VAND]] + +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT +define void @store_i25(i25 addrspace(0)* %out, i25 %in) { +entry: + store i25 %in, i25 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8: +; v2i8 is naturally 2B aligned, treat as i16 +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT + +; SI: buffer_store_short +define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1 + ret void +} + + +; FUNC-LABEL: {{^}}store_v2i16: +; v2i16 is naturally 4B aligned, treat as i32 +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT + +; SI: buffer_store_dword +define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(0)* %out + ret void +} + +; FUNC-LABEL:
{{^}}store_v2i16_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_short +; SI: buffer_store_short +define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG-NOT: MOVA_INT + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM-NOT: MOVA_INT + +; SI: buffer_store_dword +define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI-NOT: buffer_store_dword +define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}store_v8i8_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: 
MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI-NOT: buffer_store_dword +define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { +entry: + %0 = trunc <8 x i32> %in to <8 x i8> + store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8_halfaligned: +; EG: MOVA_INT +; EG: MOV {{[\* 
]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; TODO: This load and store cannot be eliminated, +; they might be different locations +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_short +; SI: buffer_store_short +; SI-NOT: buffer_store_dword +define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2 + ret void +} + +; floating-point store +; FUNC-LABEL: {{^}}store_f32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_dword + +define void @store_f32(float addrspace(0)* %out, float %in) { + store float %in, float addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i16: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? +; XSI: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i16> + store <4 x i16> %0, <4 x i16> addrspace(0)* %out + ret void +} + +; vec2 floating-point stores +; FUNC-LABEL: {{^}}store_v2f32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? +; XSI: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword + +define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { +entry: + %0 = insertelement <2 x float> undef, float %a, i32 0 + %1 = insertelement <2 x float> %0, float %b, i32 1 + store <2 x float> %1, <2 x float> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v3i32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2?
+; XSI-DAG: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { + store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? +; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32_unaligned: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? +; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4 + ret void +} + +; v4f32 store +; FUNC-LABEL: {{^}}store_v4f32: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? 
+; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { + %1 = load <4 x float>, <4 x float> addrspace(0) * %in + store <4 x float> %1, <4 x float> addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i8: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_byte +define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(0)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i16: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; SI: buffer_store_short +define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(0)* %out + ret void +} + +; The stores in this function are combined by the optimizer to create a +; 64-bit store with 32-bit alignment. This is legal and the legalizer +; should not try to split the 64-bit store back into 2 32-bit stores. + +; FUNC-LABEL: {{^}}vecload2: +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x2? +; XSI: buffer_store_dwordx2 +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +entry: + %0 = load i32, i32 addrspace(2)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 + %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + store i32 %0, i32 addrspace(0)* %out, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 + store i32 %1, i32 addrspace(0)* %arrayidx1, align 4 + ret void +} + +; When i128 was a legal type this program generated cannot select errors: + +; FUNC-LABEL: {{^}}"i128-const-store": +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, +; EG: MOVA_INT +; EG: MOV {{[\* ]*}}T(0 + AR.x).X+, + +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, +; CM: MOVA_INT +; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, + +;TODO: why not x4? 
+; XSI: buffer_store_dwordx4 +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @i128-const-store(i32 addrspace(0)* %out) { +entry: + store i32 1, i32 addrspace(0)* %out, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 + store i32 1, i32 addrspace(0)* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 2 + store i32 2, i32 addrspace(0)* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 3 + store i32 2, i32 addrspace(0)* %arrayidx6, align 4 + ret void +} + + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AVR/intrinsics/read_register.ll b/test/CodeGen/AVR/intrinsics/read_register.ll new file mode 100644 index 000000000000..3f28d1d3a9fe --- /dev/null +++ b/test/CodeGen/AVR/intrinsics/read_register.ll @@ -0,0 +1,17 @@ +; RUN: llc -O0 < %s -march=avr | FileCheck %s + +; CHECK-LABEL: foo +define void @foo() { +entry: + %val1 = call i16 @llvm.read_register.i16(metadata !0) + %val2 = call i16 @llvm.read_register.i16(metadata !1) + %val3 = call i8 @llvm.read_register.i8(metadata !2) + ret void +} + +declare i8 @llvm.read_register.i8(metadata) +declare i16 @llvm.read_register.i16(metadata) + +!0 = !{!"r28"} +!1 = !{!"Z"} +!2 = !{!"r0"} diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll new file mode 100644 index 000000000000..49980da6eb8f --- /dev/null +++ b/test/CodeGen/WebAssembly/function-bitcasts.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; Test that function pointer casts are replaced with wrappers. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; CHECK-LABEL: test: +; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}} +; CHECK-NEXT: call .Lbitcast.1@FUNCTION{{$}} +; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0 +; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L0]]{{$}} +; CHECK-NEXT: i32.call $drop=, .Lbitcast.3@FUNCTION{{$}} +; CHECK-NEXT: call foo2@FUNCTION{{$}} +; CHECK-NEXT: call foo3@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast: +; CHECK-NEXT: .local i32 +; CHECK-NEXT: call has_i32_arg@FUNCTION, $0{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast.1: +; CHECK-NEXT: call $drop=, has_i32_ret@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast.2: +; CHECK-NEXT: .param i32 +; CHECK-NEXT: call foo0@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-LABEL: .Lbitcast.3: +; CHECK-NEXT: .result i32 +; CHECK-NEXT: .local i32 +; CHECK-NEXT: call foo1@FUNCTION{{$}} +; CHECK-NEXT: copy_local $push0=, $0 +; CHECK-NEXT: .endfunc + +declare void @has_i32_arg(i32) +declare i32 @has_i32_ret() + +declare void @foo0() +declare void @foo1() +declare void @foo2() +declare void @foo3() + +define void @test() { +entry: + call void bitcast (void (i32)* @has_i32_arg to void ()*)() + call void bitcast (i32 ()* @has_i32_ret to void ()*)() + call void bitcast (void ()* @foo0 to void (i32)*)(i32 0) + %t = call i32 bitcast (void ()* @foo1 to i32 ()*)() + call void bitcast (void ()* @foo2 to void ()*)() + call void @foo3() + ret void +} diff --git a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll new file mode 100644 index 000000000000..ef4318ec299b --- /dev/null +++ b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; 
Test that function pointer casts that require conversions are not converted +; to wrappers. In theory some conversions could be supported, but currently no +; conversions are implemented. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; CHECK-LABEL: test: +; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}} +; CHECK-NEXT: call has_i64_arg@FUNCTION, $pop[[L0]]{{$}} +; CHECK-NEXT: i32.call $drop=, has_i64_ret@FUNCTION{{$}} +; CHECK-NEXT: .endfunc + +; CHECK-NOT: .Lbitcast + +declare void @has_i64_arg(i64) +declare i64 @has_i64_ret() + +define void @test() { +entry: + call void bitcast (void (i64)* @has_i64_arg to void (i32)*)(i32 0) + %t = call i32 bitcast (i64 ()* @has_i64_ret to i32 ()*)() + ret void +} diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll index e1341624cad3..aec74424b9b2 100644 --- a/test/CodeGen/X86/avx2-arith.ll +++ b/test/CodeGen/X86/avx2-arith.ll @@ -142,17 +142,108 @@ define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone ret <16 x i16> %x } -define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { +define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { +; X32-LABEL: mul_v16i8: +; X32: ## BB#0: +; X32-NEXT: vpmovsxbw %xmm1, %ymm1 +; X32-NEXT: vpmovsxbw %xmm0, %ymm0 +; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: mul_v16i8: +; X64: ## BB#0: +; X64-NEXT: vpmovsxbw %xmm1, %ymm1 +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vzeroupper +; X64-NEXT: retq %x = mul <16 x i8> %i, %j ret <16 x i8> %x } -define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +; X32-LABEL: mul_v32i8: +; X32: ## BB#0: +; X32-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X32-NEXT: vpmovsxbw %xmm2, %ymm2 +; X32-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X32-NEXT: vpmovsxbw %xmm3, %ymm3 +; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; X32-NEXT: vextracti128 $1, %ymm2, %xmm3 +; X32-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X32-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; X32-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X32-NEXT: vpmovsxbw %xmm1, %ymm1 +; X32-NEXT: vpmovsxbw %xmm0, %ymm0 +; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X32-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; X32-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_v32i8: +; X64: ## BB#0: +; X64-NEXT: vextracti128 $1, %ymm1, %xmm2 +; X64-NEXT: vpmovsxbw %xmm2, %ymm2 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X64-NEXT: vpmovsxbw %xmm3, %ymm3 +; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; X64-NEXT: vextracti128 $1, %ymm2, %xmm3 +; X64-NEXT: vmovdqa {{.*#+}} xmm4 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X64-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: vpmovsxbw %xmm1, %ymm1 +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; X64-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X64-NEXT: retq %x = mul <32 x i8> %i, %j ret <32 x i8> %x } -define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +; X32-LABEL: mul_v4i64: +; X32: ## BB#0: +; X32-NEXT: vpsrlq $32, %ymm0, %ymm2 +; X32-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 +; X32-NEXT: vpsrlq $32, %ymm1, %ymm3 +; X32-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; X32-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; X32-NEXT: vpsllq $32, %ymm2, %ymm2 +; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: mul_v4i64: +; X64: ## BB#0: +; X64-NEXT: vpsrlq $32, %ymm0, %ymm2 +; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 +; X64-NEXT: vpsrlq $32, %ymm1, %ymm3 +; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 +; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; X64-NEXT: vpsllq $32, %ymm2, %ymm2 +; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; X64-NEXT: retq %x = mul <4 x i64> %i, %j ret <4 x i64> %x } @@ -291,8 +382,8 @@ define <8 x i32> @mul_const9(<8 x i32> %x) { ret <8 x i32> %y } +; %x * 0x01010101 define <4 x i32> @mul_const10(<4 x i32> %x) { - ; %x * 0x01010101 ; X32-LABEL: mul_const10: ; X32: ## BB#0: ; X32-NEXT: vpbroadcastd LCPI22_0, %xmm1 @@ -308,8 +399,8 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { ret <4 x i32> %m } +; %x * 0x80808080 define <4 x i32> @mul_const11(<4 x i32> %x) { - ; %x * 0x80808080 ; X32-LABEL: mul_const11: ; X32: ## BB#0: ; X32-NEXT: vpbroadcastd LCPI23_0, %xmm1 diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll index 0dcfb7c169f3..e66eefdb8e9f 100644 --- a/test/CodeGen/X86/avx512-bugfix-23634.ll +++ b/test/CodeGen/X86/avx512-bugfix-23634.ll @@ -15,7 +15,7 @@ define void @f_fu(float* %ret, float* %aa, float %b) { ; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2 ; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd {{.*}}(%rip), %zmm1, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1 {%k1} ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 532678ae72fa..1a91bc1dee9a 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -25,8 +25,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -48,8 +47,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd 
$255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b @@ -65,8 +63,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -88,8 +85,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b @@ -180,8 +176,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL-NEXT: Lcfi1: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: callq _func16xi1 ; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -210,8 +205,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL_X32-NEXT: Lcfi1: ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: calll _func16xi1 ; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -285,8 +279,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: movb $85, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq @@ -322,8 +315,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: movb $85, %al ; KNL_X32-NEXT: kmovw %eax, %k1 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl diff --git 
a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index c2eb19d16650..5e50a3aef2f2 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -740,8 +740,7 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; KNL-NEXT: retq ; @@ -805,11 +804,10 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) { ; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm1, %ymm1 ; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; KNL-NEXT: retq @@ -834,8 +832,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; KNL-NEXT: retq @@ -858,8 +855,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) { ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; KNL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 32bd0804d637..03d6127ae5dc 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -345,9 +345,9 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -369,9 +369,9 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxbd (%rdi), %ymm0 -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovsxbd (%rdi), %ymm1 +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## 
kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -704,9 +704,9 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -728,9 +728,9 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpmovsxwd (%rdi), %ymm0 -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovsxwd (%rdi), %ymm1 +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -762,9 +762,9 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -1457,8 +1457,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; KNL-LABEL: sext_16i1_16i32: ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: sext_16i1_16i32: diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 26d14fa0840f..cb8ed0e59a3a 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -365,11 +365,10 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpslld $31, %zmm2, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -402,11 +401,10 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm0, 
%zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1242,30 +1240,29 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: vpextrd $1, %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpextrd $3, %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -1306,11 +1303,10 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index d48f63536e0e..b127585dc87b 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -344,8 +344,7 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB17_1: ; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 
; KNL-NEXT: LBB17_3: -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -382,8 +381,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB18_3: ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -472,8 +470,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ; KNL-NEXT: movw $1, %cx ; KNL-NEXT: cmovgw %ax, %cx ; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -510,28 +507,27 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; KNL-NEXT: movl $1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; KNL-NEXT: vpsllw $7, %ymm2, %ymm0 -; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -574,30 +570,29 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %edx, %esi ; 
KNL-NEXT: setg %al ; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -635,18 +630,17 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kshiftlw $6, %k2, %k2 ; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: kshiftlw $7, %k0, %k0 ; KNL-NEXT: korw %k0, %k1, %k1 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -1387,8 +1381,7 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_8i1: @@ -1405,8 +1398,7 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) { ; KNL-LABEL: load_16i1: ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_16i1: @@ -1424,8 +1416,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -1444,8 +1435,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq @@ -1465,10 +1455,9 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { 
; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: kmovw 2(%rdi), %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 ; KNL-NEXT: retq ; @@ -1489,17 +1478,16 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; KNL-NEXT: kmovw 2(%rdi), %k2 ; KNL-NEXT: kmovw 4(%rdi), %k3 ; KNL-NEXT: kmovw 6(%rdi), %k4 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: load_64i1: diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll index 2a0de05608b4..9234ae838cff 100644 --- a/test/CodeGen/X86/avx512-mov.ll +++ b/test/CodeGen/X86/avx512-mov.ll @@ -313,7 +313,7 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* @@ -327,7 +327,7 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* @@ -369,7 +369,7 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* @@ -383,7 +383,7 @@ define <8 x 
i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* @@ -426,7 +426,7 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* @@ -441,7 +441,7 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* @@ -486,7 +486,7 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07] +; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* @@ -501,7 +501,7 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2] ; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07] +; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll index ce8fca036c91..a29c1e4628a1 100644 --- a/test/CodeGen/X86/avx512-regcall-NoMask.ll 
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -325,11 +325,13 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) { } ; X32-LABEL: test_argRet128Vector: -; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0 +; X32: vmovdqa{{.*}} %xmm0, %xmm1 +; X32: vmovdqa{{.*}} %xmm1, %xmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet128Vector: -; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0 +; WIN64: vmovdqa{{.*}} %xmm0, %xmm1 +; WIN64: vmovdqa{{.*}} %xmm1, %xmm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 128 bit vector @@ -341,13 +343,13 @@ define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %b) ; X32-LABEL: test_CallargRet128Vector: ; X32: vmov{{.*}} %xmm0, {{%xmm([0-7])}} ; X32: call{{.*}} {{.*}}test_argRet128Vector -; X32: vpblend{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0 +; X32: vmovdqa{{.*}} {{%xmm([0-7])}}, %xmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_CallargRet128Vector: ; WIN64: vmov{{.*}} %xmm0, {{%xmm([0-9]+)}} ; WIN64: call{{.*}} {{.*}}test_argRet128Vector -; WIN64: vpblend{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0 +; WIN64: vmovdqa{{.*}} {{%xmm([0-9]+)}}, %xmm0 ; WIN64: ret{{.*}} ; Test regcall when passing/retrieving 128 bit vector @@ -358,11 +360,13 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) { } ; X32-LABEL: test_argRet256Vector: -; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0 +; X32: vmovdqa{{.*}} %ymm0, %ymm1 +; X32: vmovdqa{{.*}} %ymm1, %ymm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet256Vector: -; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0 +; WIN64: vmovdqa{{.*}} %ymm0, %ymm1 +; WIN64: vmovdqa{{.*}} %ymm1, %ymm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 256 bit vector @@ -374,13 +378,13 @@ define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %b) ; X32-LABEL: test_CallargRet256Vector: ; X32: vmov{{.*}} %ymm0, %ymm1 ; X32: call{{.*}} {{.*}}test_argRet256Vector -; X32: vpblend{{.*}} %ymm1, %ymm0, %ymm0 +; X32: vmovdqa{{.*}} %ymm1, %ymm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_CallargRet256Vector: ; WIN64: vmov{{.*}} %ymm0, %ymm1 ; WIN64: call{{.*}} {{.*}}test_argRet256Vector -; WIN64: vpblend{{.*}} %ymm1, %ymm0, %ymm0 +; WIN64: vmovdqa{{.*}} %ymm1, %ymm0 ; WIN64: ret{{.*}} ; Test regcall when passing/retrieving 256 bit vector @@ -391,11 +395,13 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) { } ; X32-LABEL: test_argRet512Vector: -; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0 +; X32: vmovdqa{{.*}} %zmm0, %zmm1 +; X32: vmovdqa{{.*}} %zmm1, %zmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_argRet512Vector: -; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0 +; WIN64: vmovdqa{{.*}} %zmm0, %zmm1 +; WIN64: vmovdqa{{.*}} %zmm1, %zmm0 ; WIN64: ret{{.*}} ; Test regcall when receiving/returning 512 bit vector @@ -407,13 +413,13 @@ define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> ; X32-LABEL: test_CallargRet512Vector: ; X32: vmov{{.*}} %zmm0, %zmm1 ; X32: call{{.*}} {{.*}}test_argRet512Vector -; X32: vpblend{{.*}} %zmm1, %zmm0, %zmm0 +; X32: vmovdqa{{.*}} %zmm1, %zmm0 ; X32: ret{{.*}} ; WIN64-LABEL: test_CallargRet512Vector: ; WIN64: vmov{{.*}} %zmm0, %zmm1 ; WIN64: call{{.*}} {{.*}}test_argRet512Vector -; WIN64: vpblend{{.*}} %zmm1, %zmm0, %zmm0 +; WIN64: vmovdqa{{.*}} %zmm1, %zmm0 ; WIN64: ret{{.*}} ; Test regcall when passing/retrieving 512 bit vector diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll index 840239b9011a..1991ee4f3376 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++
b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -218,8 +218,7 @@ define <16 x i32> @test_vbroadcast() { ; ALL: # BB#0: # %entry ; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1 -; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: knotw %k1, %k1 ; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index bd269ea87a35..361ee1ddbf9d 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -6,7 +6,8 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1 -; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = fcmp ole <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y @@ -17,7 +18,8 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1 -; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = fcmp ole <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y @@ -28,7 +30,8 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin ; CHECK-LABEL: test3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %yp, align 4 %mask = icmp eq <16 x i32> %x, %y @@ -40,7 +43,8 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) ; CHECK-LABEL: test4_unsigned: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp uge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y @@ -51,7 +55,8 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: test5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp eq <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y @@ -62,7 +67,8 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun ; CHECK-LABEL: test6_unsigned: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp ugt <8 x i64> %x, %y %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y @@ -81,7 +87,8 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) { ; SKX: ## BB#0: ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1 -; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq %mask = fcmp olt <4 x float> %a, zeroinitializer @@ -101,7 
+108,8 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) { ; SKX: ## BB#0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1 -; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq %mask = fcmp olt <2 x double> %a, zeroinitializer %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b @@ -114,14 +122,15 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { ; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovdqa %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovdqa %ymm1, %ymm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y @@ -134,14 +143,15 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind { ; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1 -; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1 -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 ; SKX-NEXT: retq %mask = fcmp oeq <8 x float> %x, %y @@ -658,9 +668,9 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b) define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) { ; CHECK-LABEL: test14: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 ; CHECK-NEXT: vpcmpgtd %zmm0, %zmm2, %k1 -; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %sub_r = sub <16 x i32> %a, %b %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a @@ -673,9 +683,9 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) { define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) { ; CHECK-LABEL: test15: ; CHECK: ## BB#0: -; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 ; CHECK-NEXT: vpcmpgtq %zmm0, %zmm2, %k1 -; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: retq %sub_r = sub <8 x i64> %a, %b %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a @@ -689,7 +699,8 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind ; CHECK-LABEL: test16: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp sge <16 x i32> %x, %y %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y @@ -700,7 +711,8 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test17: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 
%mask = icmp sgt <16 x i32> %x, %y @@ -712,7 +724,8 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp sle <16 x i32> %x, %y @@ -724,7 +737,8 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou ; CHECK-LABEL: test19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 %mask = icmp ule <16 x i32> %x, %y @@ -737,7 +751,8 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <16 x i32> %x1, %y1 %mask0 = icmp eq <16 x i32> %x, %y @@ -751,7 +766,8 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i64> %x1, %y1 %mask0 = icmp sle <8 x i64> %x, %y @@ -765,7 +781,8 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <8 x i64> %x1, %y1 %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4 @@ -780,7 +797,8 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i32> %x1, %y1 %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4 @@ -794,7 +812,8 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { ; CHECK-LABEL: test24: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 @@ -808,7 +827,8 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind ; CHECK-LABEL: test25: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 @@ -823,7 +843,8 @@ define <16 x i32> @test26(<16 x 
i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -840,7 +861,8 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -858,8 +880,7 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1 ; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ; KNL-NEXT: kxnorw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: retq ; @@ -883,8 +904,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -912,7 +932,8 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind { ; SKX-LABEL: test30: ; SKX: ## BB#0: ; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1 -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq %mask = fcmp oeq <4 x double> %x, %y @@ -930,7 +951,8 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp ; SKX-LABEL: test31: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1 -; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq %y = load <2 x double>, <2 x double>* %yp, align 4 @@ -949,7 +971,8 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp ; SKX-LABEL: test32: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1 -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq %y = load <4 x double>, <4 x double>* %yp, align 4 @@ -962,7 +985,8 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp ; CHECK-LABEL: test33: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1 -; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <8 x double>, <8 x double>* %yp, align 4 %mask = fcmp olt <8 x double> %x, %y @@ -980,7 +1004,8 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no ; SKX-LABEL: test34: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1 -; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq %y = load <4 x float>, <4 x float>* %yp, align 4 %mask = 
fcmp olt <4 x float> %x, %y @@ -995,14 +1020,15 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vmovups (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 -; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test35: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1 -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 ; SKX-NEXT: retq %y = load <8 x float>, <8 x float>* %yp, align 4 @@ -1015,7 +1041,8 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp ; CHECK-LABEL: test36: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1 -; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <16 x float>, <16 x float>* %yp, align 4 %mask = fcmp olt <16 x float> %x, %y @@ -1027,7 +1054,8 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou ; CHECK-LABEL: test37: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 -; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, %zmm0 ; CHECK-NEXT: retq %a = load double, double* %ptr @@ -1050,7 +1078,8 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou ; SKX-LABEL: test38: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1 -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 ; SKX-NEXT: retq %a = load double, double* %ptr @@ -1073,7 +1102,8 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou ; SKX-LABEL: test39: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1 -; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 ; SKX-NEXT: retq %a = load double, double* %ptr @@ -1090,7 +1120,8 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n ; CHECK-LABEL: test40: ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 -; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %a = load float, float* %ptr @@ -1109,14 +1140,15 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vbroadcastss (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 -; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovaps %ymm1, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test41: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1 -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovaps %ymm1, %ymm0 ; SKX-NEXT: retq %a = load float, float* %ptr @@ -1139,7 +1171,8 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun ; SKX-LABEL: test42: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1 -; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq %a = load 
float, float* %ptr @@ -1158,7 +1191,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} -; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovapd %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test43: @@ -1166,7 +1200,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; SKX-NEXT: vpsllw $15, %xmm2, %xmm2 ; SKX-NEXT: vpmovw2m %xmm2, %k1 ; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} -; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vmovapd %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 ; SKX-NEXT: retq %a = load double, double* %ptr diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll index c58b3cc8c3cd..11bb431414a0 100644 --- a/test/CodeGen/X86/avx512bw-mov.ll +++ b/test/CodeGen/X86/avx512bw-mov.ll @@ -26,7 +26,7 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpblendmb (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <64 x i8> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <64 x i8>* @@ -74,7 +74,7 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1 -; CHECK-NEXT: vpblendmw (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <32 x i16> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <32 x i16>* diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll index 016837e61307..34432468921b 100644 --- a/test/CodeGen/X86/avx512bw-vec-cmp.ll +++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll @@ -5,7 +5,8 @@ define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp eq <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y @@ -16,7 +17,8 @@ define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp sgt <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y @@ -27,7 +29,8 @@ define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind ; CHECK-LABEL: test3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp sge <32 x i16> %x, %y %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y @@ -38,7 +41,8 @@ define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { ; CHECK-LABEL: test4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 -; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; 
CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask = icmp ugt <64 x i8> %x, %y %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y @@ -49,7 +53,8 @@ define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwin ; CHECK-LABEL: test5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %yp, align 4 %mask = icmp eq <32 x i16> %x, %y @@ -61,7 +66,8 @@ define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp sgt <32 x i16> %x, %y @@ -73,7 +79,8 @@ define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp sle <32 x i16> %x, %y @@ -85,7 +92,8 @@ define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun ; CHECK-LABEL: test8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %y = load <32 x i16>, <32 x i16>* %y.ptr, align 4 %mask = icmp ule <32 x i16> %x, %y @@ -98,7 +106,8 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <32 x i16> %x1, %y1 %mask0 = icmp eq <32 x i16> %x, %y @@ -112,7 +121,8 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1 ; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <64 x i8> %x1, %y1 %mask0 = icmp sle <64 x i8> %x, %y @@ -126,7 +136,8 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <64 x i8> %x1, %y1 %y = load <64 x i8>, <64 x i8>* %y.ptr, align 4 @@ -141,7 +152,8 @@ define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <32 x i16> %x1, %y1 %y = load <32 x i16>, <32 x i16>* 
%y.ptr, align 4 diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll index 209f18ba7f9c..3f92641a3e16 100644 --- a/test/CodeGen/X86/avx512bwvl-mov.ll +++ b/test/CodeGen/X86/avx512bwvl-mov.ll @@ -26,7 +26,7 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmb (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x66,0x07] +; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <32 x i8> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <32 x i8>* @@ -74,7 +74,7 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmw (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x66,0x07] +; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i16> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i16>* @@ -122,7 +122,7 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmb (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x66,0x07] +; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <16 x i8> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i8>* @@ -170,7 +170,7 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04] -; CHECK-NEXT: vpblendmw (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x66,0x07] +; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i16> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i16>* diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll index 17e581bbb501..3e7f0acae78b 100644 --- a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll +++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll @@ -5,7 +5,8 @@ define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: test256_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp eq <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y @@ -16,7 +17,8 @@ define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind ; CHECK-LABEL: test256_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmb 
%ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask = icmp sgt <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 @@ -27,7 +29,8 @@ define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounw ; CHECK-LABEL: test256_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp sge <16 x i16> %x, %y %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y @@ -38,7 +41,8 @@ define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind ; CHECK-LABEL: test256_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask = icmp ugt <32 x i8> %x, %y %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1 @@ -49,7 +53,8 @@ define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nou ; CHECK-LABEL: test256_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %yp, align 4 %mask = icmp eq <16 x i16> %x, %y @@ -61,7 +66,8 @@ define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp sgt <16 x i16> %x, %y @@ -73,7 +79,8 @@ define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp sle <16 x i16> %x, %y @@ -85,7 +92,8 @@ define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) ; CHECK-LABEL: test256_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 %mask = icmp ule <16 x i16> %x, %y @@ -98,7 +106,8 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp eq <16 x i16> %x1, %y1 %mask0 = icmp eq <16 x i16> %x, %y @@ -112,7 +121,8 @@ define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <32 
x i8> %x1, %y1 %mask0 = icmp sle <32 x i8> %x, %y @@ -126,7 +136,8 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <32 x i8> %x1, %y1 %y = load <32 x i8>, <32 x i8>* %y.ptr, align 4 @@ -141,7 +152,8 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i16> %x1, %y1 %y = load <16 x i16>, <16 x i16>* %y.ptr, align 4 @@ -155,7 +167,8 @@ define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: test128_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp eq <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y @@ -166,7 +179,8 @@ define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind ; CHECK-LABEL: test128_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask = icmp sgt <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 @@ -177,7 +191,8 @@ define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind ; CHECK-LABEL: test128_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp sge <8 x i16> %x, %y %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y @@ -188,7 +203,8 @@ define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind ; CHECK-LABEL: test128_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask = icmp ugt <16 x i8> %x, %y %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1 @@ -199,7 +215,8 @@ define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwin ; CHECK-LABEL: test128_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %yp, align 4 %mask = icmp eq <8 x i16> %x, %y @@ -211,7 +228,8 @@ define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp sgt <8 x i16> %x, %y @@ -223,7 +241,8 @@ define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x 
i16>* %y.ptr) noun ; CHECK-LABEL: test128_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp sle <8 x i16> %x, %y @@ -235,7 +254,8 @@ define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun ; CHECK-LABEL: test128_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 %mask = icmp ule <8 x i16> %x, %y @@ -248,7 +268,8 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <8 x i16> %x1, %y1 %mask0 = icmp eq <8 x i16> %x, %y @@ -262,7 +283,8 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <16 x i8> %x1, %y1 %mask0 = icmp sle <16 x i8> %x, %y @@ -276,7 +298,8 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1 ; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <16 x i8> %x1, %y1 %y = load <16 x i8>, <16 x i8>* %y.ptr, align 4 @@ -291,7 +314,8 @@ define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i16> %x1, %y1 %y = load <8 x i16>, <8 x i16>* %y.ptr, align 4 diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll index e37fd76377e3..af449d6628c4 100644 --- a/test/CodeGen/X86/avx512vl-mov.ll +++ b/test/CodeGen/X86/avx512vl-mov.ll @@ -166,7 +166,7 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i32>* @@ -180,7 +180,7 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: 
vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i32>* @@ -222,7 +222,7 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i64>* @@ -236,7 +236,7 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i64>* @@ -279,7 +279,7 @@ define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07] +; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x float>* @@ -294,7 +294,7 @@ define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07] ; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x float>* @@ -338,7 +338,7 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07] +; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: 
[0x62,0xf1,0xfd,0x29,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x double>* @@ -352,7 +352,7 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07] +; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x double>* @@ -554,7 +554,7 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i32>* @@ -568,7 +568,7 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x i32>* @@ -610,7 +610,7 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x i64>* @@ -624,7 +624,7 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x i64>* @@ -666,7 +666,7 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## 
encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07] +; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x float>* @@ -680,7 +680,7 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x float>* @@ -722,7 +722,7 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07] +; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x double>* @@ -736,7 +736,7 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1 ; CHECK: ## BB#0: ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] ; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04] -; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07] +; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <2 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <2 x double>* diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll index e0acf2be653e..25b9cc79096f 100644 --- a/test/CodeGen/X86/avx512vl-vec-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -5,7 +5,8 @@ define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: test256_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp eq <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y @@ -16,7 +17,8 @@ define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind ; CHECK-LABEL: test256_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp sgt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y @@ -27,7 +29,8 @@ define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind ; CHECK-LABEL: test256_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 
%ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp sge <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y @@ -38,7 +41,8 @@ define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind ; CHECK-LABEL: test256_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask = icmp ugt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y @@ -49,7 +53,8 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin ; CHECK-LABEL: test256_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %x, %y @@ -61,7 +66,8 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_5b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %y, %x @@ -73,7 +79,8 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun ; CHECK-LABEL: test256_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sgt <8 x i32> %x, %y @@ -85,7 +92,8 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou ; CHECK-LABEL: test256_6b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp slt <8 x i32> %y, %x @@ -97,7 +105,8 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun ; CHECK-LABEL: test256_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sle <8 x i32> %x, %y @@ -109,7 +118,8 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou ; CHECK-LABEL: test256_7b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sge <8 x i32> %y, %x @@ -121,7 +131,8 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun ; CHECK-LABEL: test256_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp ule <8 x i32> %x, %y @@ -133,7 +144,8 @@ define <8 x i32> 
@test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou ; CHECK-LABEL: test256_8b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp uge <8 x i32> %y, %x @@ -146,7 +158,8 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp eq <8 x i32> %x1, %y1 %mask0 = icmp eq <8 x i32> %x, %y @@ -160,7 +173,8 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1 ; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %mask0 = icmp sle <4 x i64> %x, %y @@ -174,7 +188,8 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <4 x i64> %x1, %y1 %y = load <4 x i64>, <4 x i64>* %y.ptr, align 4 @@ -189,7 +204,8 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 @@ -203,7 +219,8 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind ; CHECK-LABEL: test256_13: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 @@ -217,7 +234,8 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind ; CHECK-LABEL: test256_14: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 @@ -232,7 +250,8 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -249,7 +268,8 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64 ; CHECK: ## BB#0: ; 
CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -265,7 +285,8 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_17: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %x, %y @@ -277,7 +298,8 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %y, %x @@ -289,7 +311,8 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %x, %y @@ -301,7 +324,8 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi ; CHECK-LABEL: test256_20: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %y, %x @@ -313,7 +337,8 @@ define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: test128_1: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp eq <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y @@ -324,7 +349,8 @@ define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind ; CHECK-LABEL: test128_2: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp sgt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y @@ -335,7 +361,8 @@ define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind ; CHECK-LABEL: test128_3: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp sge <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y @@ -346,7 +373,8 @@ define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind ; CHECK-LABEL: test128_4: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1} +; CHECK-NEXT: 
vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask = icmp ugt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y @@ -357,7 +385,8 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin ; CHECK-LABEL: test128_5: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %x, %y @@ -369,7 +398,8 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi ; CHECK-LABEL: test128_5b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %y, %x @@ -381,7 +411,8 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun ; CHECK-LABEL: test128_6: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sgt <4 x i32> %x, %y @@ -393,7 +424,8 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_6b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp slt <4 x i32> %y, %x @@ -405,7 +437,8 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun ; CHECK-LABEL: test128_7: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sle <4 x i32> %x, %y @@ -417,7 +450,8 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_7b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sge <4 x i32> %y, %x @@ -429,7 +463,8 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun ; CHECK-LABEL: test128_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ule <4 x i32> %x, %y @@ -441,7 +476,8 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_8b: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x @@ -454,7 +490,8 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> 
%y, <4 x i32> %x1, <4 x i32> ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp eq <4 x i32> %x1, %y1 %mask0 = icmp eq <4 x i32> %x, %y @@ -468,7 +505,8 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1 ; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %mask0 = icmp sle <2 x i64> %x, %y @@ -482,7 +520,8 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sgt <2 x i64> %x1, %y1 %y = load <2 x i64>, <2 x i64>* %y.ptr, align 4 @@ -497,7 +536,8 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 @@ -511,7 +551,8 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind ; CHECK-LABEL: test128_13: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 @@ -525,7 +566,8 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind ; CHECK-LABEL: test128_14: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 @@ -540,7 +582,8 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 @@ -557,7 +600,8 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1 ; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 @@ -573,7 +617,8 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_17: ; CHECK: 
## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %x, %y @@ -585,7 +630,8 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_18: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %y, %x @@ -597,7 +643,8 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_19: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %x, %y @@ -609,7 +656,8 @@ define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou ; CHECK-LABEL: test128_20: ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index 8e9bc8b5af4b..0060539c691f 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -157,16 +157,12 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind { ret i8 %d } -; FIXME: The 'not' is redundant.
- define i32 @smin(i32 %x) { ; CHECK-LABEL: smin: ; CHECK: ## BB#0: -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: notl %ecx ; CHECK-NEXT: xorl $-1, %edi ; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovsl %ecx, %eax +; CHECK-NEXT: cmovsl %edi, %eax ; CHECK-NEXT: retq %not_x = xor i32 %x, -1 %1 = icmp slt i32 %not_x, -1 diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll index 5636a5bcd73e..5329f5b216a4 100644 --- a/test/CodeGen/X86/fma-fneg-combine.ll +++ b/test/CodeGen/X86/fma-fneg-combine.ll @@ -222,9 +222,9 @@ define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i ; SKX-NEXT: kmovw %edi, %k1 ; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm3 ; SKX-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; SKX-NEXT: vblendmps %zmm1, %zmm3, %zmm1 {%k1} -; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovaps %zmm1, %zmm3 {%k1} +; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1} +; SKX-NEXT: vmovaps %zmm3, %zmm0 ; SKX-NEXT: retq ; ; KNL-LABEL: test15: @@ -232,9 +232,9 @@ ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm3 ; KNL-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; KNL-NEXT: vblendmps %zmm1, %zmm3, %zmm1 {%k1} -; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm1 {%k1} -; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: vmovaps %zmm1, %zmm3 {%k1} +; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1} +; KNL-NEXT: vmovaps %zmm3, %zmm0 ; KNL-NEXT: retq entry: %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a diff --git a/test/CodeGen/X86/fmaddsub-combine.ll b/test/CodeGen/X86/fmaddsub-combine.ll new file mode 100644 index 000000000000..f3b13cd053b4 --- /dev/null +++ b/test/CodeGen/X86/fmaddsub-combine.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s + +; This test checks the fusing of MUL + ADDSUB to FMADDSUB.
+ +define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { +; FMA3-LABEL: mul_addsub_pd128: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_pd128: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <2 x double> %A, %B + %Sub = fsub <2 x double> %AB, %C + %Add = fadd <2 x double> %AB, %C + %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3> + ret <2 x double> %Addsub +} + +define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 { +; FMA3-LABEL: mul_addsub_ps128: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_ps128: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <4 x float> %A, %B + %Sub = fsub <4 x float> %AB, %C + %Add = fadd <4 x float> %AB, %C + %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %Addsub +} + +define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 { +; FMA3-LABEL: mul_addsub_pd256: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_pd256: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <4 x double> %A, %B + %Sub = fsub <4 x double> %AB, %C + %Add = fadd <4 x double> %AB, %C + %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x double> %Addsub +} + +define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 { +; FMA3-LABEL: mul_addsub_ps256: +; FMA3: # BB#0: # %entry +; FMA3-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 +; FMA3-NEXT: retq +; +; FMA4-LABEL: mul_addsub_ps256: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 +; FMA4-NEXT: retq +entry: + %AB = fmul <8 x float> %A, %B + %Sub = fsub <8 x float> %AB, %C + %Add = fadd <8 x float> %AB, %C + %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x float> %Addsub +} + +define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 { +; FMA3_256-LABEL: mul_addsub_pd512: +; FMA3_256: # BB#0: # %entry +; FMA3_256-NEXT: vfmaddsub213pd %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmaddsub213pd %ymm5, %ymm3, %ymm1 +; FMA3_256-NEXT: retq +; +; FMA3_512-LABEL: mul_addsub_pd512: +; FMA3_512: # BB#0: # %entry +; FMA3_512-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0 +; FMA3_512-NEXT: retq +; +; FMA4-LABEL: mul_addsub_pd512: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +entry: + %AB = fmul <8 x double> %A, %B + %Sub = fsub <8 x double> %AB, %C + %Add = fadd <8 x double> %AB, %C + %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x double> %Addsub +} + +define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 { +; FMA3_256-LABEL: mul_addsub_ps512: +; FMA3_256: # BB#0: # %entry +; FMA3_256-NEXT: vfmaddsub213ps %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmaddsub213ps %ymm5, %ymm3, %ymm1 +; FMA3_256-NEXT: retq +; +; FMA3_512-LABEL: mul_addsub_ps512: +; FMA3_512: # BB#0: # %entry +; FMA3_512-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0 +; FMA3_512-NEXT: retq +;
+; FMA4-LABEL: mul_addsub_ps512: +; FMA4: # BB#0: # %entry +; FMA4-NEXT: vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +entry: + %AB = fmul <16 x float> %A, %B + %Sub = fsub <16 x float> %AB, %C + %Add = fadd <16 x float> %AB, %C + %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> + ret <16 x float> %Addsub +} + +attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll index 7159d4c87174..32594a27698d 100644 --- a/test/CodeGen/X86/sse-fsignum.ll +++ b/test/CodeGen/X86/sse-fsignum.ll @@ -93,15 +93,14 @@ define void @signum32b(<8 x float>*) { ; AVX512F-NEXT: vmovaps (%rdi), %ymm0 ; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512F-NEXT: vcvtdq2ps %ymm3, %ymm3 +; AVX512F-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512F-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, (%rdi) ; AVX512F-NEXT: retq entry: diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index abe3da752874..c34f333ef785 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -4,6 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; ; 128-bit vector comparisons @@ -308,12 +310,26 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i16> %a0, %a1 ret <16 x i1> %1 } @@ -589,13 +605,26 @@ define <8 x i1>
@test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v8f64: -; AVX512: # BB#0: -; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v8f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v8f64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v8f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 ret <8 x i1> %1 } @@ -636,13 +665,26 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16f32: -; AVX512: # BB#0: -; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16f32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16f32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16f32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 ret <16 x i1> %1 } @@ -734,13 +776,26 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v8i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v8i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v8i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v8i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 ret <8 x i1> %1 } @@ -784,13 +839,26 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512-NEXT: 
vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 ret <16 x i1> %1 } @@ -1045,16 +1113,35 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32i16: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 -; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i16> %a0, %a1 ret <32 x i1> %1 } @@ -1874,15 +1961,31 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v64i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 -; AVX512-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512-NEXT: vmovdqa %xmm4, %xmm2 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512F-NEXT: vmovdqa %xmm4, %xmm2 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v64i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm4,
%xmm3 +; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <64 x i8> %a0, %a1 ret <64 x i1> %1 } @@ -1957,120 +2060,350 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v16f64: -; AVX512: # BB#0: -; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm5 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm7 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5 -; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm6 -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm7 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX512-NEXT: vucomisd %xmm2, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4 -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5 -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -;
AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5 -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm6 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX512-NEXT: vucomisd %xmm3, %xmm1 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vucomisd %xmm3, %xmm1 -; AVX512-NEXT: cmovaq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v16f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm5 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm7 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm6 +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm7 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm6 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 
= xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm4 +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm5 +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm6 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512F-NEXT: vucomisd %xmm3, %xmm1 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vucomisd %xmm3, %xmm1 +; AVX512F-NEXT: cmovaq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v16f64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm5 +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm6 +; 
AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm7 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm5 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm6 +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm7 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm5 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm5 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm6 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: 
vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1 +; AVX512DQ-NEXT: cmovaq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v16f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm7 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm7 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm6 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; 
AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm6 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX512BW-NEXT: vucomisd %xmm3, %xmm1 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vucomisd %xmm3, %xmm1 +; AVX512BW-NEXT: cmovaq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x double> %a0, %a1 ret <16 x i1> %1 } @@ -2416,207 +2749,612 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32f32: -; AVX512: # BB#0: -; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm6 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3] -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vucomiss %xmm5, %xmm7 -; AVX512-NEXT: movl $-1, %ecx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmoval %ecx, %esi -; AVX512-NEXT: vmovd %esi, %xmm5 -; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] -; AVX512-NEXT: vucomiss %xmm7, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5 -; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512-NEXT: vucomiss %xmm4, %xmm6 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8 -; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm7 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3] -; AVX512-NEXT: vucomiss %xmm6, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmoval %ecx, %edx -; AVX512-NEXT: vucomiss %xmm5, %xmm7 
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
-; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm7
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm5, %xmm7
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm2, %xmm0
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm2, %xmm0
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm2, %xmm5
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm2, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
-; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
-; AVX512-NEXT: vucomiss %xmm7, %xmm0
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm0, %xmm5
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm0, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm3, %xmm1
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm3, %xmm1
-; AVX512-NEXT: cmoval %ecx, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v32f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512F-NEXT: vucomiss %xmm7, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm2, %xmm0
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm2, %xmm0
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm2, %xmm5
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm2, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512F-NEXT: vucomiss %xmm7, %xmm0
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm0, %xmm5
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm0, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm3, %xmm1
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm3, %xmm1
+; AVX512F-NEXT: cmoval %ecx, %eax
+; AVX512F-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v32f32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $-1, %ecx
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm7, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm7, %xmm0
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1
+; AVX512DQ-NEXT: cmoval %ecx, %eax
+; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v32f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512BW-NEXT: xorl %eax, %eax
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $-1, %ecx
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512BW-NEXT: vucomiss %xmm7, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm0
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm0
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm5
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512BW-NEXT: vucomiss %xmm7, %xmm0
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm0, %xmm5
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm0, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm3, %xmm1
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm3, %xmm1
+; AVX512BW-NEXT: cmoval %ecx, %eax
+; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
 %1 = fcmp ogt <32 x float> %a0, %a1
 ret <32 x i1> %1
 }
@@ -2785,136 +3523,398 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: test_cmp_v16i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: cmpq %rcx, %rdx
-; AVX512-NEXT: movq $-1, %rcx
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: vmovq %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm7
-; AVX512-NEXT: vmovq %xmm5, %rdx
-; AVX512-NEXT: vmovq %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm7
-; AVX512-NEXT: vmovq %xmm5, %rdx
-; AVX512-NEXT: vmovq %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm2, %rdx
-; AVX512-NEXT: vmovq %xmm0, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vmovq %xmm2, %rdx
-; AVX512-NEXT: vmovq %xmm4, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: vmovq %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: vmovq %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vmovq %xmm3, %rdx
-; AVX512-NEXT: vmovq %xmm1, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: cmovgq %rcx, %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v16i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: cmpq %rcx, %rdx
+; AVX512F-NEXT: movq $-1, %rcx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm4, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm7
+; AVX512F-NEXT: vmovq %xmm5, %rdx
+; AVX512F-NEXT: vmovq %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm7
+; AVX512F-NEXT: vmovq %xmm5, %rdx
+; AVX512F-NEXT: vmovq %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm2, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vmovq %xmm2, %rdx
+; AVX512F-NEXT: vmovq %xmm4, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm4, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm4, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vmovq %xmm3, %rdx
+; AVX512F-NEXT: vmovq %xmm1, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: cmovgq %rcx, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v16i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: cmpq %rcx, %rdx
+; AVX512DQ-NEXT: movq $-1, %rcx
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm4, %rdx
+; AVX512DQ-NEXT: vmovq %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm6
+; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm7
+; AVX512DQ-NEXT: vmovq %xmm5, %rdx
+; AVX512DQ-NEXT: vmovq %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm6
+; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm7
+; AVX512DQ-NEXT: vmovq %xmm5, %rdx
+; AVX512DQ-NEXT: vmovq %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm2, %rdx
+; AVX512DQ-NEXT: vmovq %xmm0, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm2, %rdx
+; AVX512DQ-NEXT: vmovq %xmm4, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm2
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm4, %rdx
+; AVX512DQ-NEXT: vmovq %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm4, %rdx
+; AVX512DQ-NEXT: vmovq %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm3, %rdx
+; AVX512DQ-NEXT: vmovq %xmm1, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: cmovgq %rcx, %rax
+; AVX512DQ-NEXT: vmovq %rax, %xmm1
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v16i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512BW-NEXT: xorl %eax, %eax
+; AVX512BW-NEXT: cmpq %rcx, %rdx
+; AVX512BW-NEXT: movq $-1, %rcx
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm4, %rdx
+; AVX512BW-NEXT: vmovq %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm7
+; AVX512BW-NEXT: vmovq %xmm5, %rdx
+; AVX512BW-NEXT: vmovq %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm7
+; AVX512BW-NEXT: vmovq %xmm5, %rdx
+; AVX512BW-NEXT: vmovq %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm2, %rdx
+; AVX512BW-NEXT: vmovq %xmm0, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vmovq %xmm2, %rdx
+; AVX512BW-NEXT: vmovq %xmm4, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm2
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm4, %rdx
+; AVX512BW-NEXT: vmovq %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm4, %rdx
+; AVX512BW-NEXT: vmovq %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vmovq %xmm3, %rdx
+; AVX512BW-NEXT: vmovq %xmm1, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: cmovgq %rcx, %rax
+; AVX512BW-NEXT: vmovq %rax, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: retq
 %1 = icmp sgt <16 x i64> %a0, %a1
 ret <16 x i1> %1
 }
@@ -3252,223 +4252,660 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: test_cmp_v32i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %ecx
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: cmpl %ecx, %edx
-; AVX512-NEXT: movl $-1, %ecx
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm4, %esi
-; AVX512-NEXT: vmovd %xmm5, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm5, %esi
-; AVX512-NEXT: vmovd %xmm6, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm7
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm5, %esi
-; AVX512-NEXT: vmovd %xmm6, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm7
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512-NEXT: vpextrd $1, %xmm0, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm2, %esi
-; AVX512-NEXT: vmovd %xmm0, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512-NEXT: vpextrd $2, %xmm0, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512-NEXT: vpextrd $3, %xmm0, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm2, %esi
-; AVX512-NEXT: vmovd %xmm4, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512-NEXT: vpextrd $2, %xmm4, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512-NEXT: vpextrd $3, %xmm4, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm4, %esi
-; AVX512-NEXT: vmovd %xmm5, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm4, %esi
-; AVX512-NEXT: vmovd %xmm5, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm3, %edx
-; AVX512-NEXT: vpextrd $1, %xmm1, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm3, %esi
-; AVX512-NEXT: vmovd %xmm1, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $2, %xmm3, %edx
-; AVX512-NEXT: vpextrd $2, %xmm1, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm3, %edx
-; AVX512-NEXT: vpextrd $3, %xmm1, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: cmovgl %ecx, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v32i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %ecx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: cmpl %ecx, %edx
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm4, %esi
+; AVX512F-NEXT: vmovd %xmm5, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm6
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm5, %esi
+; AVX512F-NEXT: vmovd %xmm6, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm7
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm5, %esi
+; AVX512F-NEXT: vmovd %xmm6, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm7
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $1, %xmm0, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm2, %esi
+; AVX512F-NEXT: vmovd %xmm0, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm6
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm0, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm2, %esi
+; AVX512F-NEXT: vmovd %xmm4, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm4, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
+; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm4, %esi
+; AVX512F-NEXT: vmovd %xmm5, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm6 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: vpextrd $2, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %edx +; AVX512F-NEXT: vpextrd $3, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm4, %edx +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm4, %esi +; AVX512F-NEXT: vmovd %xmm5, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm6 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: vpextrd $2, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %edx +; AVX512F-NEXT: vpextrd $3, %xmm5, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm3, %edx +; AVX512F-NEXT: vpextrd $1, %xmm1, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vmovd %xmm3, %esi +; AVX512F-NEXT: vmovd %xmm1, %edi +; AVX512F-NEXT: cmpl %esi, %edi +; AVX512F-NEXT: movl $0, %esi +; AVX512F-NEXT: cmovgl %ecx, %esi +; AVX512F-NEXT: vmovd %esi, %xmm5 +; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm3, %edx +; AVX512F-NEXT: vpextrd $2, %xmm1, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgl %ecx, %edx +; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm3, %edx +; AVX512F-NEXT: vpextrd $3, %xmm1, %esi +; AVX512F-NEXT: cmpl %edx, %esi +; AVX512F-NEXT: cmovgl %ecx, %eax +; AVX512F-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %ecx +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: cmpl %ecx, %edx +; AVX512DQ-NEXT: movl $-1, %ecx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm4, %esi +; AVX512DQ-NEXT: vmovd %xmm5, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; 
AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm5, %esi +; AVX512DQ-NEXT: vmovd %xmm6, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm7 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm5, %esi +; AVX512DQ-NEXT: vmovd %xmm6, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm7 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $1, %xmm0, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm2, %esi +; AVX512DQ-NEXT: vmovd %xmm0, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm0, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm0, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx +; AVX512DQ-NEXT: vextracti32x4 $3, 
%zmm1, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm2, %esi +; AVX512DQ-NEXT: vmovd %xmm4, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm4, %esi +; AVX512DQ-NEXT: vmovd %xmm5, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx +; AVX512DQ-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm4, %esi +; AVX512DQ-NEXT: vmovd %xmm5, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm6 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512DQ-NEXT: vpextrd $1, %xmm3, %edx +; AVX512DQ-NEXT: vpextrd $1, %xmm1, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vmovd %xmm3, %esi +; AVX512DQ-NEXT: vmovd %xmm1, %edi +; AVX512DQ-NEXT: cmpl %esi, %edi +; AVX512DQ-NEXT: movl $0, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %esi +; AVX512DQ-NEXT: vmovd %esi, %xmm5 +; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $2, %xmm3, %edx +; AVX512DQ-NEXT: vpextrd $2, %xmm1, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: movl 
$0, %edx +; AVX512DQ-NEXT: cmovgl %ecx, %edx +; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpextrd $3, %xmm3, %edx +; AVX512DQ-NEXT: vpextrd $3, %xmm1, %esi +; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: cmovgl %ecx, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %ecx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: cmpl %ecx, %edx +; AVX512BW-NEXT: movl $-1, %ecx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; 
AVX512BW-NEXT: vpextrd $3, %xmm6, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $1, %xmm0, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm0, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm0, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm0, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm4, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm4, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm4, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; 
AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm3, %edx +; AVX512BW-NEXT: vpextrd $1, %xmm1, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vmovd %xmm3, %esi +; AVX512BW-NEXT: vmovd %xmm1, %edi +; AVX512BW-NEXT: cmpl %esi, %edi +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgl %ecx, %esi +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm3, %edx +; AVX512BW-NEXT: vpextrd $2, %xmm1, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgl %ecx, %edx +; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %edx +; AVX512BW-NEXT: vpextrd $3, %xmm1, %esi +; AVX512BW-NEXT: cmpl %edx, %esi +; AVX512BW-NEXT: cmovgl %ecx, %eax +; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i32> %a0, %a1 ret <32 x i1> %1 } @@ -4342,291 +5779,987 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v64i16: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 -; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3 -; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 -; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm3 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, 
%eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 -; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 -; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm2 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: 
kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vpsllw $7, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vpxor %ymm6, %ymm6, %ymm6 -; AVX512-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2 -; AVX512-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 -; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512-NEXT: kshiftlw $14, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kshiftlw $14, 
%k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $13, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $12, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $11, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $10, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $9, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $8, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $7, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $6, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $5, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $4, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $3, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $2, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftlw $1, %k0, %k1 -; AVX512-NEXT: kshiftrw $15, %k1, %k1 -; AVX512-NEXT: kmovw %k1, %eax -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: kshiftrw $15, %k0, %k0 -; AVX512-NEXT: kmovw %k0, %eax -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v64i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; 
AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm2 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw 
%k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpxor %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; 
AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm6, 
%ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v64i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm3 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm2 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $13, 
%k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpxor %ymm6, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm1 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; 
AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1 +; AVX512DQ-NEXT: 
kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1 +; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 +; AVX512DQ-NEXT: kmovw %k1, %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v64i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %ecx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: cmpw %cx, %dx +; AVX512BW-NEXT: movw $-1, %cx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; 
AVX512BW-NEXT: vpextrw $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm5, %esi +; AVX512BW-NEXT: vmovd %xmm6, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm7 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw 
%cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7 +; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $1, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm0, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm0, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm2, %esi +; AVX512BW-NEXT: vmovd %xmm4, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx +; AVX512BW-NEXT: 
vpextrw $5, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm4, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5 +; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm4, %esi +; AVX512BW-NEXT: vmovd %xmm5, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm6 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, 
%dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX512BW-NEXT: vpextrw $1, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $1, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vmovd %xmm3, %esi +; AVX512BW-NEXT: vmovd %xmm1, %edi +; AVX512BW-NEXT: cmpw %si, %di +; AVX512BW-NEXT: movl $0, %esi +; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: vmovd %esi, %xmm5 +; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $2, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $2, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $3, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $3, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $4, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $4, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $5, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $5, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $6, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $6, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrw $7, %xmm3, %edx +; AVX512BW-NEXT: vpextrw $7, %xmm1, %esi +; AVX512BW-NEXT: cmpw %dx, %si +; AVX512BW-NEXT: cmovgw %cx, %ax +; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <64 x i16> %a0, %a1 ret <64 x i1> %1 } @@ -6240,50 +8373,103 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v128i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1 -; AVX512-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 -; AVX512-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpmovsxbd %xmm4, %zmm4 -; AVX512-NEXT: vpslld $31, %zmm4, %zmm4 
-; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k0 -; AVX512-NEXT: kmovw %k0, 14(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 -; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512-NEXT: kmovw %k0, 12(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512-NEXT: vpslld $31, %zmm3, %zmm3 -; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512-NEXT: kmovw %k0, 10(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 -; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512-NEXT: kmovw %k0, 8(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512-NEXT: vpslld $31, %zmm2, %zmm2 -; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0 -; AVX512-NEXT: kmovw %k0, 6(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512-NEXT: kmovw %k0, 4(%rdi) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512-NEXT: vpslld $31, %zmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512-NEXT: kmovw %k0, 2(%rdi) -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kmovw %k0, (%rdi) -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v128i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512F-NEXT: vpslld $31, %zmm4, %zmm4 +; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0 +; AVX512F-NEXT: kmovw %k0, 14(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: kmovw %k0, 12(%rdi) +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512F-NEXT: kmovw %k0, 10(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, 8(%rdi) +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, 6(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, 4(%rdi) +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, 2(%rdi) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, (%rdi) +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v128i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4 +; AVX512DQ-NEXT: vpslld $31, %zmm4, %zmm4 +; AVX512DQ-NEXT: 
vptestmd %zmm4, %zmm4, %k0 +; AVX512DQ-NEXT: kmovw %k0, 14(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 12(%rdi) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3 +; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0 +; AVX512DQ-NEXT: kmovw %k0, 10(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 8(%rdi) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512DQ-NEXT: kmovw %k0, 6(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, (%rdi) +; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v128i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm1, %k0 +; AVX512BW-NEXT: vpcmpgtb %zmm2, %zmm0, %k1 +; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 +; AVX512BW-NEXT: retq %1 = icmp sgt <128 x i8> %a0, %a1 ret <128 x i1> %1 } @@ -6781,231 +8967,684 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32f64: -; AVX512: # BB#0: -; AVX512-NEXT: vextractf32x4 $3, %zmm4, %xmm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm9 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vucomisd %xmm8, %xmm9 -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; AVX512-NEXT: vucomisd %xmm8, %xmm9 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm4, %xmm9 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm10 -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512-NEXT: vextractf32x4 $1, %zmm4, %xmm9 -; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm10 -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] -; AVX512-NEXT: vucomisd %xmm9, %xmm10 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq 
%rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm5, %xmm4 -; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm0 -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm5, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm0 -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vextractf32x4 $1, %zmm5, %xmm4 -; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm0 -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm0 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] -; AVX512-NEXT: vucomisd %xmm5, %xmm1 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm1 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512-NEXT: vextractf32x4 $3, %zmm6, %xmm1 -; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512-NEXT: 
vextractf32x4 $2, %zmm6, %xmm4 -; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX512-NEXT: vucomisd %xmm4, %xmm5 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vextractf32x4 $1, %zmm6, %xmm1 -; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm4 -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm1, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX512-NEXT: vucomisd %xmm6, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vextractf32x4 $3, %zmm7, %xmm1 -; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vucomisd %xmm1, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vucomisd %xmm1, %xmm2 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm7, %xmm2 -; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vextractf32x4 $1, %zmm7, %xmm2 -; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX512-NEXT: vucomisd %xmm2, %xmm4 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX512-NEXT: vucomisd %xmm7, %xmm3 -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovaq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] -; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = 
xmm3[1,0] -; AVX512-NEXT: vucomisd %xmm5, %xmm3 -; AVX512-NEXT: cmovaq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm9 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: vucomisd %xmm8, %xmm9 +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512F-NEXT: vucomisd %xmm8, %xmm9 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm9 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-NEXT: vextractf32x4 $1, %zmm4, %xmm9 +; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm10 +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512F-NEXT: vucomisd %xmm9, %xmm10 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm4 +; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm0 +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; AVX512F-NEXT: 
vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-NEXT: vextractf32x4 $1, %zmm5, %xmm4 +; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm0 +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm0 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm1 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm1 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm1 +; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm4 +; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512F-NEXT: vucomisd %xmm4, %xmm5 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vextractf32x4 $1, %zmm6, %xmm1 +; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm4 +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm1, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512F-NEXT: vucomisd %xmm6, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: 
vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1 +; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vucomisd %xmm1, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vucomisd %xmm1, %xmm2 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm2 +; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vextractf32x4 $1, %zmm7, %xmm2 +; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512F-NEXT: vucomisd %xmm2, %xmm4 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512F-NEXT: vucomisd %xmm7, %xmm3 +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovaq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512F-NEXT: vucomisd %xmm5, %xmm3 +; AVX512F-NEXT: cmovaq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32f64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm4, %xmm8 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm9 +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9 +; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX512DQ-NEXT: 
vextractf64x2 $2, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm10 +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm4, %xmm9 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm10 +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm0 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm5, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm0 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm1 +; AVX512DQ-NEXT: 
movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm1 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm6, %xmm4 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm6, %xmm1 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512DQ-NEXT: vucomisd %xmm6, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm7, %xmm1 +; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vucomisd %xmm1, %xmm2 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512DQ-NEXT: 
vextractf64x2 $2, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm7, %xmm2 +; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512DQ-NEXT: vucomisd %xmm7, %xmm3 +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovaq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512DQ-NEXT: vucomisd %xmm5, %xmm3 +; AVX512DQ-NEXT: cmovaq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm4, %xmm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm9 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: vucomisd %xmm8, %xmm9 +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512BW-NEXT: vucomisd %xmm8, %xmm9 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm4, %xmm9 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm10 +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm4, %xmm9 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm10 +; AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; 
AVX512BW-NEXT: vucomisd %xmm9, %xmm10 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm5, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm0 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm5, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm0 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm0 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm1 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm1 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm6, %xmm1 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: 
vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm6, %xmm4 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX512BW-NEXT: vucomisd %xmm4, %xmm5 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm6, %xmm1 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm1, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX512BW-NEXT: vucomisd %xmm6, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm7, %xmm1 +; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vucomisd %xmm1, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vucomisd %xmm1, %xmm2 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm7, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm7, %xmm2 +; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; AVX512BW-NEXT: 
vucomisd %xmm2, %xmm4 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX512BW-NEXT: vucomisd %xmm7, %xmm3 +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovaq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512BW-NEXT: vucomisd %xmm5, %xmm3 +; AVX512BW-NEXT: cmovaq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = fcmp ogt <32 x double> %a0, %a1 ret <32 x i1> %1 } @@ -7639,263 +10278,780 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cmp_v32i64: -; AVX512: # BB#0: -; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm8 -; AVX512-NEXT: vpextrq $1, %xmm8, %rcx -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: cmpq %rcx, %rdx -; AVX512-NEXT: movq $-1, %rcx -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm8, %rdx -; AVX512-NEXT: vmovq %xmm9, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm10 -; AVX512-NEXT: vpextrq $1, %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vmovq %xmm9, %rdx -; AVX512-NEXT: vmovq %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512-NEXT: vextracti32x4 $1, %zmm4, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm9, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm10 -; AVX512-NEXT: vpextrq $1, %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm11 -; AVX512-NEXT: vmovq %xmm9, %rdx -; AVX512-NEXT: vmovq %xmm10, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, 
%ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm8 -; AVX512-NEXT: vextracti32x4 $3, %zmm5, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm9 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm5, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vextracti32x4 $1, %zmm5, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm10 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm5, %rdx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm8 -; AVX512-NEXT: vextracti32x4 $3, %zmm6, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm1, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm6, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5 -; AVX512-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; 
AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vextracti32x4 $1, %zmm6, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX512-NEXT: vpextrq $1, %xmm6, %rdx -; AVX512-NEXT: vpextrq $1, %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm6, %rdx -; AVX512-NEXT: vmovq %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm1 -; AVX512-NEXT: vextracti32x4 $3, %zmm7, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm2, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm7, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm2, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 -; AVX512-NEXT: vextracti32x4 $1, %zmm7, %xmm0 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm5 -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: vmovq %xmm4, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX512-NEXT: vpextrq $1, %xmm7, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovgq %rcx, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm4 -; AVX512-NEXT: vmovq %xmm7, %rdx -; AVX512-NEXT: vmovq %xmm3, %rsi -; AVX512-NEXT: cmpq %rdx, %rsi -; AVX512-NEXT: cmovgq %rcx, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512-NEXT: vinserti128 
$1, %xmm0, %ymm3, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cmp_v32i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm8 +; AVX512F-NEXT: vpextrq $1, %xmm8, %rcx +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm9 +; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: cmpq %rcx, %rdx +; AVX512F-NEXT: movq $-1, %rcx +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm8, %rdx +; AVX512F-NEXT: vmovq %xmm9, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm9 +; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm10 +; AVX512F-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vmovq %xmm9, %rdx +; AVX512F-NEXT: vmovq %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-NEXT: vextracti32x4 $1, %zmm4, %xmm9 +; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm10 +; AVX512F-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm11 +; AVX512F-NEXT: vmovq %xmm9, %rdx +; AVX512F-NEXT: vmovq %xmm10, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm9 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; AVX512F-NEXT: 
vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-NEXT: vextracti32x4 $1, %zmm5, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm10 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm5, %rdx +; AVX512F-NEXT: vmovq %xmm1, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm6, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm1, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm6, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vmovq %xmm4, %rdx +; AVX512F-NEXT: vmovq %xmm5, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vextracti32x4 $1, %zmm6, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm5[0] +; AVX512F-NEXT: vpextrq $1, %xmm6, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm6, %rdx +; AVX512F-NEXT: vmovq %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm7, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm2, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512F-NEXT: vextracti32x4 $2, %zmm7, %xmm2 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm2, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti32x4 $1, %zmm7, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm5 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm4, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512F-NEXT: vpextrq $1, %xmm7, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: cmovgq %rcx, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm4 +; AVX512F-NEXT: vmovq %xmm7, %rdx +; AVX512F-NEXT: vmovq %xmm3, %rsi +; AVX512F-NEXT: cmpq %rdx, %rsi +; AVX512F-NEXT: cmovgq %rcx, %rax +; AVX512F-NEXT: vmovq %rax, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: test_cmp_v32i64: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm4, %xmm8 +; AVX512DQ-NEXT: vpextrq $1, %xmm8, %rcx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm9 +; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512DQ-NEXT: xorl %eax, %eax +; AVX512DQ-NEXT: cmpq %rcx, %rdx +; AVX512DQ-NEXT: movq $-1, %rcx +; AVX512DQ-NEXT: movl 
$0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm8, %rdx +; AVX512DQ-NEXT: vmovq %xmm9, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm4, %xmm9 +; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm10 +; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vmovq %xmm9, %rdx +; AVX512DQ-NEXT: vmovq %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm4, %xmm9 +; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm10 +; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm11 +; AVX512DQ-NEXT: vmovq %xmm9, %rdx +; AVX512DQ-NEXT: vmovq %xmm10, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm5, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm9 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm5, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm0, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm5, %xmm0 +; AVX512DQ-NEXT: 
vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm10 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm5, %rdx +; AVX512DQ-NEXT: vmovq %xmm1, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm6, %xmm1 +; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm1, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm1 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm6, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5 +; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vmovq %xmm4, %rdx +; AVX512DQ-NEXT: vmovq %xmm5, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm6, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm6, %rdx +; AVX512DQ-NEXT: vmovq %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq 
{{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm7, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2 +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm2, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm7, %xmm2 +; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm2, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm7, %xmm0 +; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4 +; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm5 +; AVX512DQ-NEXT: vmovq %xmm0, %rdx +; AVX512DQ-NEXT: vmovq %xmm4, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512DQ-NEXT: vpextrq $1, %xmm7, %rdx +; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: movl $0, %edx +; AVX512DQ-NEXT: cmovgq %rcx, %rdx +; AVX512DQ-NEXT: vmovq %rdx, %xmm4 +; AVX512DQ-NEXT: vmovq %xmm7, %rdx +; AVX512DQ-NEXT: vmovq %xmm3, %rsi +; AVX512DQ-NEXT: cmpq %rdx, %rsi +; AVX512DQ-NEXT: cmovgq %rcx, %rax +; AVX512DQ-NEXT: vmovq %rax, %xmm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: test_cmp_v32i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm8 +; AVX512BW-NEXT: vpextrq $1, %xmm8, %rcx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm9 +; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: cmpq %rcx, %rdx +; AVX512BW-NEXT: movq $-1, %rcx +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm8, %rdx +; AVX512BW-NEXT: vmovq %xmm9, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, 
%xmm9 +; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm10 +; AVX512BW-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vmovq %xmm9, %rdx +; AVX512BW-NEXT: vmovq %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm4, %xmm9 +; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm10 +; AVX512BW-NEXT: vpextrq $1, %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm11 +; AVX512BW-NEXT: vmovq %xmm9, %rdx +; AVX512BW-NEXT: vmovq %xmm10, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm9 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm0, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm5, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm10 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; 
AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm5, %rdx +; AVX512BW-NEXT: vmovq %xmm1, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm6, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm1, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm6, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vmovq %xmm4, %rdx +; AVX512BW-NEXT: vmovq %xmm5, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm6, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512BW-NEXT: vpextrq $1, %xmm6, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm6, %rdx +; AVX512BW-NEXT: vmovq %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm7, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2 +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, 
%rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm2, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX512BW-NEXT: vextracti32x4 $2, %zmm7, %xmm2 +; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm2, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm7, %xmm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4 +; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm5 +; AVX512BW-NEXT: vmovq %xmm0, %rdx +; AVX512BW-NEXT: vmovq %xmm4, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX512BW-NEXT: vpextrq $1, %xmm7, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: movl $0, %edx +; AVX512BW-NEXT: cmovgq %rcx, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm4 +; AVX512BW-NEXT: vmovq %xmm7, %rdx +; AVX512BW-NEXT: vmovq %xmm3, %rsi +; AVX512BW-NEXT: cmpq %rdx, %rsi +; AVX512BW-NEXT: cmovgq %rcx, %rax +; AVX512BW-NEXT: vmovq %rax, %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %1 = icmp sgt <32 x i64> %a0, %a1 ret <32 x i1> %1 } diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index 39fbc7611de8..774d615ae896 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1244,8 +1244,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: retq ; @@ -1253,8 +1252,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: retq ; @@ -1435,8 +1433,7 @@ define <4 x i32> 
@load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512F-NEXT: retq @@ -1445,8 +1442,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq @@ -1642,8 +1638,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: retq ; @@ -1651,8 +1646,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: retq ; @@ -1945,8 +1939,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -1954,8 +1947,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: retq ; @@ -2348,8 +2340,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -2357,8 +2348,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; @@ -2860,8 +2850,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX512-LABEL: load_sext_16i1_to_16i8: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, 
%zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -3398,8 +3387,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX512-LABEL: load_sext_16i1_to_16i16: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: retq ; @@ -4244,12 +4232,11 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 ; AVX512-NEXT: kmovw 2(%rdi), %k2 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index 27b65b829923..440faa689fb8 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; @@ -321,13 +322,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: ; X32-SSE: # BB#0: @@ -499,30 +509,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; ; AVX512-LABEL: 
var_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: @@ -911,30 +901,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: splatvar_shift_v16i8: ; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: 
vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: @@ -1221,13 +1191,21 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v8i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v8i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: ; X32-SSE: # BB#0: @@ -1384,31 +1362,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX512-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX512-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: 
constant_shift_v16i8: diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index ee1879b6696e..79902acfec24 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -212,13 +213,21 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i16: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i16: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -331,33 +340,41 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: var_shift_v32i8: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4 -; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3 -; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v32i8: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: var_shift_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } 
@@ -608,34 +625,43 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4
-; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4
-; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4
-; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3
-; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3
-; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3
-; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: splatvar_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -804,13 +830,20 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
%shift = ashr <16 x i16> %a,
ret <16 x i16> %shift
}
@@ -913,34 +946,41 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4
-; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4
-; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4
-; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3
-; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3
-; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3
-; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = ashr <32 x i8> %a,
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index 1280641c557b..2c9e433cfb2c 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -26,25 +26,14 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
-; AVX512DQ-NEXT: vpsravd %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
-; AVX512DQ-NEXT: vpsravd %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
-; AVX512DQ-NEXT: vpsravd %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
-; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
@@ -1025,24 +1014,13 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; AVX512DQ-NEXT: vpsravd %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 42488f2ec3a7..a7e1a531b659 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -5,6 +5,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
@@ -290,13 +291,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v8i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
@@ -417,18 +427,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
@@ -701,18 +703,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
@@ -955,13 +949,21 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v8i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v8i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
@@ -1064,19 +1066,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index 5223d7bba353..25667e7d1661 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Variable Shifts
@@ -189,13 +190,21 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -275,21 +284,29 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = lshr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -490,22 +507,31 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: splatvar_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -659,13 +685,20 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
%shift = lshr <16 x i16> %a,
ret <16 x i16> %shift
}
@@ -739,22 +772,29 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = lshr <32 x i8> %a,
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll
index 4c3caf329fb7..3da8f9437e57 100644
--- a/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -27,25 +27,14 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
-; AVX512DQ-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
-; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
-; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
-; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
@@ -988,24 +977,13 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
-; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 5c89949e924b..8706078b40c9 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -5,6 +5,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
@@ -245,13 +246,22 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v8i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
@@ -367,17 +377,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
@@ -642,17 +645,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
@@ -827,13 +823,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v8i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v8i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
@@ -919,18 +920,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX512-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index eb52ae3ccaca..a1ef2791c1b0 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
@@ -164,13 +165,21 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -240,20 +249,28 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = shl <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -446,21 +463,30 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: splatvar_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -571,13 +597,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
%shift = shl <16 x i16> %a,
ret <16 x i16> %shift
}
@@ -645,21 +676,28 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
-; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1
-; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v32i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%shift = shl <32 x i8> %a,
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index 520c3237a57f..b9c9b56427f1 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -27,25 +27,14 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
-; AVX512DQ-NEXT: vpsllvd %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
-; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
-; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
-; AVX512DQ-NEXT: vpsllvd %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 2836d69a0fec..f4650ec741a7 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -178,13 +178,8 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll
index 04d6b3733246..37fd022999e4 100644
--- a/test/CodeGen/X86/vector-shuffle-masked.ll
+++ b/test/CodeGen/X86/vector-shuffle-masked.ll
@@ -216,7 +216,8 @@ define <8 x i32>
@mask_shuffle_v8i32_23456701(<8 x i32> %a, <8 x i32> %passthru, ; CHECK: # BB#0: ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0] ; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -686,3 +687,33 @@ define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x doub %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> %passthru ret <2 x double> %res } + +define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_mask: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0] +; CHECK-NEXT: retq + %q = load double, double* %x, align 1 + %vecinit.i = insertelement <2 x double> undef, double %q, i32 0 + %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1 + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %vecinit2.i, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask) { +; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_maskz: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0] +; CHECK-NEXT: retq + %q = load double, double* %x, align 1 + %vecinit.i = insertelement <2 x double> undef, double %q, i32 0 + %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1 + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %vecinit2.i, <2 x double> zeroinitializer + ret <2 x double> %res +} diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index 3ad92737a2ef..4312b67546d2 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -71,13 +71,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] -; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -101,14 +100,13 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} -; 
AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpslld $31, %zmm3, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -157,13 +155,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -185,8 +182,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -215,8 +211,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -241,8 +236,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -271,8 +265,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 @@ -301,13 +294,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; 
AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -337,10 +329,10 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -367,8 +359,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -403,9 +394,8 @@ define i64 @shuf64i1_zero(i64 %a) { ; AVX512F-NEXT: andq $-32, %rsp ; AVX512F-NEXT: subq $96, %rsp ; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/test/ExecutionEngine/Interpreter/lit.local.cfg b/test/ExecutionEngine/Interpreter/lit.local.cfg index 8cbaf03217d5..231d8e22cc6f 100644 --- a/test/ExecutionEngine/Interpreter/lit.local.cfg +++ b/test/ExecutionEngine/Interpreter/lit.local.cfg @@ -1,3 +1,3 @@ # These tests require foreign function calls -if config.enable_ffi != "ON": +if not config.enable_ffi: config.unsupported = True diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s index 3ba95e4d394b..a9ec00939504 100644 --- a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s +++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s @@ -1,6 +1,11 @@ # RUN: llvm-mc -triple=aarch64_be-none-linux-gnu -filetype=obj -o %T/be-reloc.o %s # RUN: llvm-rtdyld -triple=aarch64_be-none-linux-gnu -verify -dummy-extern f=0x0123456789abcdef -check=%s %T/be-reloc.o + .globl Q + .section .dummy, "ax" +Q: + nop + .text .globl g .p2align 2 @@ -23,8 +28,11 @@ g: .globl k .p2align 3 k: - .xword f + .xword f .size 
k, 8 +r: +# R_AARCH64_PREL32: use Q instead of f to fit in 32 bits. + .word Q - . # LE instructions read as BE # rtdyld-check: *{4}(g) = 0x6024e0d2 @@ -32,3 +40,4 @@ k: # rtdyld-check: *{4}(g + 8) = 0x6035b1f2 # rtdyld-check: *{4}(g + 12) = 0xe0bd99f2 # rtdyld-check: *{8}k = f +# rtdyld-check: *{4}r = (Q - r)[31:0] diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_local_branch.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_local_branch.s new file mode 100644 index 000000000000..679930a14e06 --- /dev/null +++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_local_branch.s @@ -0,0 +1,14 @@ +# RUN: llvm-mc -triple=arm64-none-linux-gnu -filetype=obj -o %T/branch.o %s +# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -check=%s %T/branch.o + +.globl _main +.weak _label1 + +.section .text.1,"ax" +_label1: + nop +_main: + b _label1 + +## Branch 1 instruction back from _main +# rtdyld-check: *{4}(_main) = 0x17ffffff diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s index c57234a906e3..f9a03ab40667 100644 --- a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s +++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s @@ -1,6 +1,11 @@ # RUN: llvm-mc -triple=arm64-none-linux-gnu -filetype=obj -o %T/reloc.o %s # RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -dummy-extern f=0x0123456789abcdef -check=%s %T/reloc.o - + + .globl Q + .section .dummy, "ax" +Q: + nop + .text .globl g .p2align 2 @@ -14,6 +19,18 @@ g: movk x0, #:abs_g1_nc:f # R_AARCH64_MOVW_UABS_G0_NC movk x0, #:abs_g0_nc:f +l: +# R_AARCH64_LDST32_ABS_LO12_NC + ldr s4, [x5, :lo12:a] +# R_AARCH64_LDST64_ABS_LO12_NC + ldr x4, [x5, :lo12:a] +p: +# R_AARCH64_ADR_PREL_PG_HI21 +# Test both low and high immediate values + adrp x4, a + 20480 # 16384 + 4096 +# Align next label to 16 bytes, so that LDST immediate +# fields will be non-zero + .align 4 a: # R_AARCH64_ADD_ABS_LO12_NC add x0, x0, :lo12:f @@ -27,13 +44,27 @@ a: .p2align 3 k: .xword f - .size k, 8 + .size k, 16 +r: +# R_AARCH64_PREL32: use Q instead of f to fit in 32 bits. + .word Q - . # rtdyld-check: *{4}(g) = 0xd2e02460 # rtdyld-check: *{4}(g + 4) = 0xf2c8ace0 # rtdyld-check: *{4}(g + 8) = 0xf2b13560 # rtdyld-check: *{4}(g + 12) = 0xf299bde0 + +## Check LDST32_ABS_LO12_NC and LDST64_ABS_LO12_NC +# rtdyld-check: (*{4}l)[21:10] = a[11:2] +# rtdyld-check: (*{4}(l+4))[21:10] = a[11:3] + +## Check ADR_PREL_PG_HI21. Low order bits of immediate value +## go to bits 30:29. 
High order bits go to bits 23:5 +# rtdyld-check: (*{4}p)[30:29] = (a - p + 20480)[13:12] +# rtdyld-check: (*{4}p)[23:5] = (a - p + 20480)[32:14] + # rtdyld-check: *{8}k = f +# rtdyld-check: *{4}r = (Q - r)[31:0] ## f & 0xFFF = 0xdef (bits 11:0 of f) ## 0xdef << 10 = 0x37bc00 diff --git a/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll b/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll index 9b0c1ef9b5e0..af4da14d786f 100644 --- a/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll +++ b/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll @@ -22,7 +22,7 @@ target triple = "x86_64-apple-macosx10.11.0" ; CHECK: @__asan_binder_global = internal global {{.*}} @global {{.*}} [[METADATA]] {{.*}} section "__DATA,__asan_liveness,regular,live_support" ; Test that there is the flag global variable: -; CHECK: @__asan_globals_registered = common global i64 0 +; CHECK: @__asan_globals_registered = common hidden global i64 0 ; The binder has to be inserted to llvm.compiler.used to avoid being stripped ; during LTO. diff --git a/test/JitListener/lit.local.cfg b/test/JitListener/lit.local.cfg index 05f34a744ad6..f485229b01c2 100644 --- a/test/JitListener/lit.local.cfg +++ b/test/JitListener/lit.local.cfg @@ -1,3 +1,3 @@ -if not config.root.llvm_use_intel_jitevents == "true": +if not config.root.llvm_use_intel_jitevents: config.unsupported = True diff --git a/test/ThinLTO/X86/Inputs/funcimport-tbaa.ll b/test/ThinLTO/X86/Inputs/funcimport-tbaa.ll new file mode 100644 index 000000000000..72aea1e5e252 --- /dev/null +++ b/test/ThinLTO/X86/Inputs/funcimport-tbaa.ll @@ -0,0 +1,11 @@ +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + + +define i32 @main() { +entry: + %unused = call float @globalfunc1(i32* null, float*null) + ret i32 0 +} + +declare float @globalfunc1(i32*, float*) \ No newline at end of file diff --git a/test/ThinLTO/X86/Inputs/local_name_conflict1.ll b/test/ThinLTO/X86/Inputs/local_name_conflict1.ll new file mode 100644 index 000000000000..2ef7bdd3eb7b --- /dev/null +++ b/test/ThinLTO/X86/Inputs/local_name_conflict1.ll @@ -0,0 +1,17 @@ +; ModuleID = 'local_name_conflict.o' +source_filename = "local_name_conflict.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define i32 @a() { +entry: + %call = call i32 @foo() + ret i32 %call +} + +; Function Attrs: noinline nounwind uwtable +define internal i32 @foo() { +entry: + ret i32 1 +} diff --git a/test/ThinLTO/X86/Inputs/local_name_conflict2.ll b/test/ThinLTO/X86/Inputs/local_name_conflict2.ll new file mode 100644 index 000000000000..a8c20a29228a --- /dev/null +++ b/test/ThinLTO/X86/Inputs/local_name_conflict2.ll @@ -0,0 +1,17 @@ +; ModuleID = 'local_name_conflict.o' +source_filename = "local_name_conflict.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define i32 @b() { +entry: + %call = call i32 @foo() + ret i32 %call +} + +; Function Attrs: noinline nounwind uwtable +define internal i32 @foo() { +entry: + ret i32 2 +} diff --git a/test/ThinLTO/X86/funcimport-tbaa.ll b/test/ThinLTO/X86/funcimport-tbaa.ll new file mode 100644 index 000000000000..c3dfd7d90b00 --- /dev/null +++ b/test/ThinLTO/X86/funcimport-tbaa.ll @@ -0,0 +1,38 @@ +; We generate invalid TBAA, hence -disable-verify, but this is a convenient way +; to trigger a 
metadata lazyloading crash
+
+; RUN: opt -module-summary %s -o %t.bc -bitcode-mdindex-threshold=0 -disable-verify
+; RUN: opt -module-summary %p/Inputs/funcimport-tbaa.ll -o %t2.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t3.bc %t.bc %t2.bc
+
+
+; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc -o - \
+; RUN: | llvm-dis -o - | FileCheck %s --check-prefix=IMPORTGLOB1
+; IMPORTGLOB1: define available_externally float @globalfunc1
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define float @globalfunc1(i32*, float*) {
+  %3 = load i32, i32* %0, align 4, !tbaa !0
+  %4 = sitofp i32 %3 to float
+  %5 = load float, float* %1, align 4, !tbaa !4
+  %6 = fadd float %4, %5
+  ret float %6
+}
+
+; We need a second function to force the metadata to be emitted in the global block
+define float @globalfunc2(i32*, float*) {
+  %3 = load i32, i32* %0, align 4, !tbaa !0
+  %4 = sitofp i32 %3 to float
+  %5 = load float, float* %1, align 4, !tbaa !4
+  %6 = fadd float %4, %5
+  ret float %6
+}
+
+!0 = !{!1, !4, i64 0}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"float", !2, i64 0}
diff --git a/test/ThinLTO/X86/local_name_conflict.ll b/test/ThinLTO/X86/local_name_conflict.ll
new file mode 100644
index 000000000000..9cbb32ecf211
--- /dev/null
+++ b/test/ThinLTO/X86/local_name_conflict.ll
@@ -0,0 +1,29 @@
+; Do setup work for all below tests: generate bitcode and combined index
+; RUN: opt -module-summary -module-hash %s -o %t.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/local_name_conflict1.ll -o %t2.bc
+; RUN: opt -module-summary -module-hash %p/Inputs/local_name_conflict2.ll -o %t3.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t4.bc %t.bc %t2.bc %t3.bc
+
+; Make sure foo is promoted and renamed without complaint in both
+; Inputs/local_name_conflict1.ll and Inputs/local_name_conflict2.ll
+; FIXME: Once the importer is fixed to import the correct copy of the
+; local, we should be able to verify that via an import action.
+; RUN: llvm-lto -thinlto-action=promote %t2.bc -thinlto-index=%t4.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=EXPORTSTATIC
+; RUN: llvm-lto -thinlto-action=promote %t3.bc -thinlto-index=%t4.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=EXPORTSTATIC
+; EXPORTSTATIC: define hidden i32 @foo.llvm.
+
+; ModuleID = 'local_name_conflict_main.o'
+source_filename = "local_name_conflict_main.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  %call = call i32 (...) @b()
+  ret i32 %call
+}
+
+declare i32 @b(...)
diff --git a/test/Transforms/GVN/invariant.group.ll b/test/Transforms/GVN/invariant.group.ll
index 026671a5bdf4..d0b32d7f3dd8 100644
--- a/test/Transforms/GVN/invariant.group.ll
+++ b/test/Transforms/GVN/invariant.group.ll
@@ -344,11 +344,63 @@ _Z1gR1A.exit: ; preds = %0, %5
 ret void
 }
 
+; Check if no optimizations are performed with global pointers.
+; FIXME: we could do the optimizations if we would check if dependency comes
+; from the same function.
+; CHECK-LABEL: define void @testGlobal() { +define void @testGlobal() { +; CHECK: %a = load i8, i8* @unknownPtr, !invariant.group !0 + %a = load i8, i8* @unknownPtr, !invariant.group !0 + call void @foo2(i8* @unknownPtr, i8 %a) +; CHECK: %1 = load i8, i8* @unknownPtr, !invariant.group !0 + %1 = load i8, i8* @unknownPtr, !invariant.group !0 + call void @bar(i8 %1) + + %b0 = bitcast i8* @unknownPtr to i1* + call void @fooBit(i1* %b0, i1 1) +; Adding regex because of canonicalization of bitcasts +; CHECK: %2 = load i1, i1* {{.*}}, !invariant.group !0 + %2 = load i1, i1* %b0, !invariant.group !0 + call void @fooBit(i1* %b0, i1 %2) +; CHECK: %3 = load i1, i1* {{.*}}, !invariant.group !0 + %3 = load i1, i1* %b0, !invariant.group !0 + call void @fooBit(i1* %b0, i1 %3) + ret void +} +; And in the case it is not global +; CHECK-LABEL: define void @testNotGlobal() { +define void @testNotGlobal() { + %a = alloca i8 + call void @foo(i8* %a) +; CHECK: %b = load i8, i8* %a, !invariant.group !0 + %b = load i8, i8* %a, !invariant.group !0 + call void @foo2(i8* %a, i8 %b) + + %1 = load i8, i8* %a, !invariant.group !0 +; CHECK: call void @bar(i8 %b) + call void @bar(i8 %1) + + %b0 = bitcast i8* %a to i1* + call void @fooBit(i1* %b0, i1 1) +; CHECK: %trunc = trunc i8 %b to i1 + %2 = load i1, i1* %b0, !invariant.group !0 +; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc) + call void @fooBit(i1* %b0, i1 %2) + %3 = load i1, i1* %b0, !invariant.group !0 +; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc) + call void @fooBit(i1* %b0, i1 %3) + ret void +} + + declare void @foo(i8*) +declare void @foo2(i8*, i8) declare void @bar(i8) declare i8* @getPointer(i8*) declare void @_ZN1A3fooEv(%struct.A*) declare void @_ZN1AC1Ev(%struct.A*) +declare void @fooBit(i1*, i1) + declare i8* @llvm.invariant.group.barrier(i8*) ; Function Attrs: nounwind diff --git a/test/Transforms/InstCombine/assume.ll b/test/Transforms/InstCombine/assume.ll index 7987aa242319..6e690426db99 100644 --- a/test/Transforms/InstCombine/assume.ll +++ b/test/Transforms/InstCombine/assume.ll @@ -2,7 +2,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; Function Attrs: nounwind uwtable define i32 @foo1(i32* %a) #0 { entry: %0 = load i32, i32* %a, align 4 @@ -22,7 +21,6 @@ entry: ret i32 %0 } -; Function Attrs: nounwind uwtable define i32 @foo2(i32* %a) #0 { entry: ; Same check as in @foo1, but make sure it works if the assume is first too. 
@@ -40,7 +38,6 @@ entry: ret i32 %0 } -; Function Attrs: nounwind declare void @llvm.assume(i1) #1 define i32 @simple(i32 %a) #1 { @@ -55,7 +52,6 @@ entry: ret i32 %a } -; Function Attrs: nounwind uwtable define i32 @can1(i1 %a, i1 %b, i1 %c) { entry: %and1 = and i1 %a, %b @@ -71,7 +67,6 @@ entry: ret i32 5 } -; Function Attrs: nounwind uwtable define i32 @can2(i1 %a, i1 %b, i1 %c) { entry: %v = or i1 %a, %b @@ -103,7 +98,6 @@ entry: ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @bar2(i32 %a) #0 { entry: ; CHECK-LABEL: @bar2 @@ -118,7 +112,6 @@ entry: ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @bar3(i32 %a, i1 %x, i1 %y) #0 { entry: %and1 = and i32 %a, 3 @@ -139,7 +132,6 @@ entry: ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @bar4(i32 %a, i32 %b) { entry: %and1 = and i32 %b, 3 @@ -160,30 +152,41 @@ entry: } define i32 @icmp1(i32 %a) #0 { -entry: +; CHECK-LABEL: @icmp1( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 1 +; %cmp = icmp sgt i32 %a, 5 tail call void @llvm.assume(i1 %cmp) %conv = zext i1 %cmp to i32 ret i32 %conv - -; CHECK-LABEL: @icmp1 -; CHECK: call void @llvm.assume -; CHECK: ret i32 1 - } -; Function Attrs: nounwind uwtable define i32 @icmp2(i32 %a) #0 { -entry: +; CHECK-LABEL: @icmp2( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 0 +; %cmp = icmp sgt i32 %a, 5 tail call void @llvm.assume(i1 %cmp) - %0 = zext i1 %cmp to i32 - %lnot.ext = xor i32 %0, 1 + %t0 = zext i1 %cmp to i32 + %lnot.ext = xor i32 %t0, 1 ret i32 %lnot.ext +} -; CHECK-LABEL: @icmp2 -; CHECK: call void @llvm.assume -; CHECK: ret i32 0 +; FIXME: If the 'not' of a condition is known true, then the condition must be false. 
+ +define i1 @assume_not(i1 %cond) { +; CHECK-LABEL: @assume_not( +; CHECK-NEXT: [[NOTCOND:%.*]] = xor i1 [[COND:%.*]], true +; CHECK-NEXT: call void @llvm.assume(i1 [[NOTCOND]]) +; CHECK-NEXT: ret i1 [[COND]] +; + %notcond = xor i1 %cond, true + call void @llvm.assume(i1 %notcond) + ret i1 %cond } declare void @escape(i32* %a) diff --git a/test/Transforms/InstCombine/assume2.ll b/test/Transforms/InstCombine/assume2.ll index c41bbaa04eb7..e8fbc049f41a 100644 --- a/test/Transforms/InstCombine/assume2.ll +++ b/test/Transforms/InstCombine/assume2.ll @@ -1,170 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; Function Attrs: nounwind declare void @llvm.assume(i1) #1 -; Function Attrs: nounwind uwtable define i32 @test1(i32 %a) #0 { -entry: -; CHECK-LABEL: @test1 -; CHECK: call void @llvm.assume -; CHECK: ret i32 5 - +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[A:%.*]], 15 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 5 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 5 +; %and = and i32 %a, 15 %cmp = icmp eq i32 %and, 5 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 7 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test2(i32 %a) #0 { -entry: -; CHECK-LABEL: @test2 -; CHECK: call void @llvm.assume -; CHECK: ret i32 2 - +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[A_NOT:%.*]] = or i32 [[A:%.*]], -16 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A_NOT]], -6 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 2 +; %and = and i32 %a, 15 %nand = xor i32 %and, -1 %cmp = icmp eq i32 %nand, 4294967285 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 7 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test3(i32 %a) #0 { -entry: -; CHECK-LABEL: @test3 -; CHECK: call void @llvm.assume -; CHECK: ret i32 5 - +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[V:%.*]] = or i32 [[A:%.*]], -16 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V]], -11 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 5 +; %v = or i32 %a, 4294967280 %cmp = icmp eq i32 %v, 4294967285 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 7 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test4(i32 %a) #0 { -entry: -; CHECK-LABEL: @test4 -; CHECK: call void @llvm.assume -; CHECK: ret i32 2 - +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[A_NOT:%.*]] = and i32 [[A:%.*]], 15 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A_NOT]], 10 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 2 +; %v = or i32 %a, 4294967280 %nv = xor i32 %v, -1 %cmp = icmp eq i32 %nv, 5 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 7 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test5(i32 %a) #0 { -entry: -; CHECK-LABEL: @test5 -; CHECK: call void @llvm.assume -; CHECK: ret i32 4 - +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 4 +; %v = xor i32 %a, 1 %cmp = icmp eq i32 %v, 5 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 7 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test6(i32 %a) #0 { -entry: -; CHECK-LABEL: @test6 -; CHECK: call void @llvm.assume -; CHECK: ret i32 5 - +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[V_MASK:%.*]] = and i32 
[[A:%.*]], 1073741823 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V_MASK]], 5 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 5 +; %v = shl i32 %a, 2 %cmp = icmp eq i32 %v, 20 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 63 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test7(i32 %a) #0 { -entry: -; CHECK-LABEL: @test7 -; CHECK: call void @llvm.assume -; CHECK: ret i32 20 - +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[V_MASK:%.*]] = and i32 [[A:%.*]], -4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V_MASK]], 20 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 20 +; %v = lshr i32 %a, 2 %cmp = icmp eq i32 %v, 5 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 252 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test8(i32 %a) #0 { -entry: -; CHECK-LABEL: @test8 -; CHECK: call void @llvm.assume -; CHECK: ret i32 20 - +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[V_MASK:%.*]] = and i32 [[A:%.*]], -4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V_MASK]], 20 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 20 +; %v = lshr i32 %a, 2 %cmp = icmp eq i32 %v, 5 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 252 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test9(i32 %a) #0 { -entry: -; CHECK-LABEL: @test9 -; CHECK: call void @llvm.assume -; CHECK: ret i32 0 - +; CHECK-LABEL: @test9( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 0 +; %cmp = icmp sgt i32 %a, 5 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 2147483648 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test10(i32 %a) #0 { -entry: -; CHECK-LABEL: @test10 -; CHECK: call void @llvm.assume -; CHECK: ret i32 -2147483648 - +; CHECK-LABEL: @test10( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], -1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 -2147483648 +; %cmp = icmp sle i32 %a, -2 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 2147483648 ret i32 %and1 } -; Function Attrs: nounwind uwtable define i32 @test11(i32 %a) #0 { -entry: -; CHECK-LABEL: @test11 -; CHECK: call void @llvm.assume -; CHECK: ret i32 0 - +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[A:%.*]], 257 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i32 0 +; %cmp = icmp ule i32 %a, 256 tail call void @llvm.assume(i1 %cmp) - %and1 = and i32 %a, 3072 ret i32 %and1 } diff --git a/test/Transforms/InstCombine/fabs.ll b/test/Transforms/InstCombine/fabs.ll index 09bea5895aaf..6b5f5a949530 100644 --- a/test/Transforms/InstCombine/fabs.ll +++ b/test/Transforms/InstCombine/fabs.ll @@ -13,7 +13,8 @@ define float @square_fabs_call_f32(float %x) { ; CHECK-LABEL: square_fabs_call_f32( ; CHECK-NEXT: %mul = fmul float %x, %x -; CHECK-NEXT: ret float %mul +; CHECK-NEXT: %fabsf = tail call float @fabsf(float %mul) +; CHECK-NEXT: ret float %fabsf } define double @square_fabs_call_f64(double %x) { @@ -23,7 +24,8 @@ define double @square_fabs_call_f64(double %x) { ; CHECK-LABEL: square_fabs_call_f64( ; CHECK-NEXT: %mul = fmul double %x, %x -; CHECK-NEXT: ret double %mul +; CHECK-NEXT: %fabs = tail call double @fabs(double %mul) +; CHECK-NEXT: ret double %fabs } define fp128 @square_fabs_call_f128(fp128 %x) { @@ -33,15 +35,18 @@ define fp128 @square_fabs_call_f128(fp128 %x) { ; CHECK-LABEL: square_fabs_call_f128( ; CHECK-NEXT: %mul = fmul 
fp128 %x, %x
-; CHECK-NEXT: ret fp128 %mul
+; CHECK-NEXT: %fabsl = tail call fp128 @fabsl(fp128 %mul)
+; CHECK-NEXT: ret fp128 %fabsl
 }
 
-; Make sure all intrinsic calls are eliminated when the input is known positive.
+; Make sure all intrinsic calls are eliminated when the input is known
+; positive.
 
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
 declare fp128 @llvm.fabs.f128(fp128)
 
+; The fabs cannot be eliminated because %x may be a NaN
 define float @square_fabs_intrinsic_f32(float %x) {
   %mul = fmul float %x, %x
   %fabsf = tail call float @llvm.fabs.f32(float %mul)
@@ -49,7 +54,8 @@ define float @square_fabs_intrinsic_f32(float %x) {
 
 ; CHECK-LABEL: square_fabs_intrinsic_f32(
 ; CHECK-NEXT: %mul = fmul float %x, %x
-; CHECK-NEXT: ret float %mul
+; CHECK-NEXT: %fabsf = tail call float @llvm.fabs.f32(float %mul)
+; CHECK-NEXT: ret float %fabsf
 }
 
 define double @square_fabs_intrinsic_f64(double %x) {
@@ -59,7 +65,8 @@ define double @square_fabs_intrinsic_f64(double %x) {
 
 ; CHECK-LABEL: square_fabs_intrinsic_f64(
 ; CHECK-NEXT: %mul = fmul double %x, %x
-; CHECK-NEXT: ret double %mul
+; CHECK-NEXT: %fabs = tail call double @llvm.fabs.f64(double %mul)
+; CHECK-NEXT: ret double %fabs
 }
 
 define fp128 @square_fabs_intrinsic_f128(fp128 %x) {
@@ -69,7 +76,20 @@ define fp128 @square_fabs_intrinsic_f128(fp128 %x) {
 
 ; CHECK-LABEL: square_fabs_intrinsic_f128(
 ; CHECK-NEXT: %mul = fmul fp128 %x, %x
-; CHECK-NEXT: ret fp128 %mul
+; CHECK-NEXT: %fabsl = tail call fp128 @llvm.fabs.f128(fp128 %mul)
+; CHECK-NEXT: ret fp128 %fabsl
+}
+
+; TODO: This should be able to eliminate the fabs
+define float @square_nnan_fabs_intrinsic_f32(float %x) {
+  %mul = fmul nnan float %x, %x
+  %fabsf = call float @llvm.fabs.f32(float %mul)
+  ret float %fabsf
+
+; CHECK-LABEL: square_nnan_fabs_intrinsic_f32(
+; CHECK-NEXT: %mul = fmul nnan float %x, %x
+; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %mul)
+; CHECK-NEXT: ret float %fabsf
 }
 
 ; Shrinking a library call to a smaller type should not be inhibited by nor inhibit the square optimization.
@@ -82,7 +102,10 @@ define float @square_fabs_shrink_call1(float %x) { ret float %trunc ; CHECK-LABEL: square_fabs_shrink_call1( -; CHECK-NEXT: %trunc = fmul float %x, %x +; CHECK-NEXT: %ext = fpext float %x to double +; CHECK-NEXT: %sq = fmul double %ext, %ext +; CHECK-NEXT: call double @fabs(double %sq) +; CHECK-NEXT: %trunc = fptrunc double %fabs to float ; CHECK-NEXT: ret float %trunc } @@ -95,7 +118,8 @@ define float @square_fabs_shrink_call2(float %x) { ; CHECK-LABEL: square_fabs_shrink_call2( ; CHECK-NEXT: %sq = fmul float %x, %x -; CHECK-NEXT: ret float %sq +; CHECK-NEXT: %fabsf = call float @fabsf(float %sq) +; CHECK-NEXT: ret float %fabsf } ; CHECK-LABEL: @fabs_select_constant_negative_positive( diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll index 6ccf6e9fa774..84f24ca0bf24 100644 --- a/test/Transforms/InstCombine/fast-math.ll +++ b/test/Transforms/InstCombine/fast-math.ll @@ -672,7 +672,8 @@ define double @sqrt_intrinsic_arg_4th(double %x) { ; CHECK-LABEL: sqrt_intrinsic_arg_4th( ; CHECK-NEXT: %mul = fmul fast double %x, %x -; CHECK-NEXT: ret double %mul +; CHECK-NEXT: %fabs = call fast double @llvm.fabs.f64(double %mul) +; CHECK-NEXT: ret double %fabs } define double @sqrt_intrinsic_arg_5th(double %x) { @@ -684,8 +685,9 @@ define double @sqrt_intrinsic_arg_5th(double %x) { ; CHECK-LABEL: sqrt_intrinsic_arg_5th( ; CHECK-NEXT: %mul = fmul fast double %x, %x +; CHECK-NEXT: %fabs = call fast double @llvm.fabs.f64(double %mul) ; CHECK-NEXT: %sqrt1 = call fast double @llvm.sqrt.f64(double %x) -; CHECK-NEXT: %1 = fmul fast double %mul, %sqrt1 +; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1 ; CHECK-NEXT: ret double %1 } diff --git a/test/Transforms/InstCombine/urem-simplify-bug.ll b/test/Transforms/InstCombine/urem-simplify-bug.ll index 1220dfdc77f0..4f18f3598540 100644 --- a/test/Transforms/InstCombine/urem-simplify-bug.ll +++ b/test/Transforms/InstCombine/urem-simplify-bug.ll @@ -1,32 +1,36 @@ -; RUN: opt < %s -instcombine -S | grep "= or i32 %x, -5" +; RUN: opt < %s -instcombine -S | FileCheck %s -@.str = internal constant [5 x i8] c"foo\0A\00" ; <[5 x i8]*> [#uses=1] -@.str1 = internal constant [5 x i8] c"bar\0A\00" ; <[5 x i8]*> [#uses=1] +@.str = internal constant [5 x i8] c"foo\0A\00" +@.str1 = internal constant [5 x i8] c"bar\0A\00" define i32 @main() nounwind { entry: - %x = call i32 @func_11( ) nounwind ; [#uses=1] - %tmp3 = or i32 %x, -5 ; [#uses=1] - %tmp5 = urem i32 251, %tmp3 ; [#uses=1] - %tmp6 = icmp ne i32 %tmp5, 0 ; [#uses=1] - %tmp67 = zext i1 %tmp6 to i32 ; [#uses=1] - %tmp9 = urem i32 %tmp67, 95 ; [#uses=1] - %tmp10 = and i32 %tmp9, 1 ; [#uses=1] - %tmp12 = icmp eq i32 %tmp10, 0 ; [#uses=1] - br i1 %tmp12, label %bb14, label %bb - -bb: ; preds = %entry - br label %bb15 - -bb14: ; preds = %entry - br label %bb15 - -bb15: ; preds = %bb14, %bb - %iftmp.0.0 = phi i8* [ getelementptr ([5 x i8], [5 x i8]* @.str1, i32 0, i32 0), %bb14 ], [ getelementptr ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), %bb ] ; [#uses=1] - %tmp17 = call i32 (i8*, ...) 
@printf( i8* %iftmp.0.0 ) nounwind ; [#uses=0] - ret i32 0 + %x = call i32 @func_11() nounwind + %tmp3 = or i32 %x, -5 + %tmp5 = urem i32 251, %tmp3 + %tmp6 = icmp ne i32 %tmp5, 0 + %tmp67 = zext i1 %tmp6 to i32 + %tmp9 = urem i32 %tmp67, 95 + %tmp10 = and i32 %tmp9, 1 + %tmp12 = icmp eq i32 %tmp10, 0 + br i1 %tmp12, label %bb14, label %bb + +bb: + br label %bb15 + +bb14: + br label %bb15 + +bb15: + %iftmp.0.0 = phi i8* [ getelementptr ([5 x i8], [5 x i8]* @.str1, i32 0, i32 0), %bb14 ], [ getelementptr ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), %bb ] + %tmp17 = call i32 (i8*, ...) @printf(i8* %iftmp.0.0) nounwind + ret i32 0 } +; CHECK-LABEL: define i32 @main( +; CHECK: call i32 @func_11() +; CHECK-NEXT: br i1 false, label %bb14, label %bb + declare i32 @func_11() -declare i32 @printf(i8*, ...) nounwind +declare i32 @printf(i8*, ...) nounwind diff --git a/test/Transforms/InstSimplify/div.ll b/test/Transforms/InstSimplify/div.ll new file mode 100644 index 000000000000..b8ce34aaa37e --- /dev/null +++ b/test/Transforms/InstSimplify/div.ll @@ -0,0 +1,15 @@ +; RUN: opt < %s -instsimplify -S | FileCheck %s + +declare i32 @external() + +define i32 @div1() { +; CHECK-LABEL: @div1( +; CHECK: [[CALL:%.*]] = call i32 @external(), !range !0 +; CHECK-NEXT: ret i32 0 +; + %call = call i32 @external(), !range !0 + %urem = udiv i32 %call, 3 + ret i32 %urem +} + +!0 = !{i32 0, i32 3} diff --git a/test/Transforms/InstSimplify/rem.ll b/test/Transforms/InstSimplify/rem.ll index df3f659b782e..c73d34346ded 100644 --- a/test/Transforms/InstSimplify/rem.ll +++ b/test/Transforms/InstSimplify/rem.ll @@ -49,3 +49,17 @@ define i32 @rem3(i32 %x, i32 %n) { %mod1 = urem i32 %mod, %n ret i32 %mod1 } + +declare i32 @external() + +define i32 @rem4() { +; CHECK-LABEL: @rem4( +; CHECK: [[CALL:%.*]] = call i32 @external(), !range !0 +; CHECK-NEXT: ret i32 [[CALL]] +; + %call = call i32 @external(), !range !0 + %urem = urem i32 %call, 3 + ret i32 %urem +} + +!0 = !{i32 0, i32 3} diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll index cb6981ede1e7..c61131b476b9 100644 --- a/test/Transforms/LICM/hoisting.ll +++ b/test/Transforms/LICM/hoisting.ll @@ -5,6 +5,8 @@ declare void @foo() +declare i32 @llvm.bitreverse.i32(i32) + ; This testcase tests for a problem where LICM hoists ; potentially trapping instructions when they are not guaranteed to execute. 
define i32 @test1(i1 %c) { @@ -122,3 +124,28 @@ then: ; preds = %tailrecurse ifend: ; preds = %tailrecurse ret { i32*, i32 } %d } + +; CHECK: define i32 @hoist_bitreverse(i32) +; CHECK: bitreverse +; CHECK: br label %header +define i32 @hoist_bitreverse(i32) { + br label %header + +header: + %sum = phi i32 [ 0, %1 ], [ %5, %latch ] + %2 = phi i32 [ 0, %1 ], [ %6, %latch ] + %3 = icmp slt i32 %2, 1024 + br i1 %3, label %body, label %return + +body: + %4 = call i32 @llvm.bitreverse.i32(i32 %0) + %5 = add i32 %sum, %4 + br label %latch + +latch: + %6 = add nsw i32 %2, 1 + br label %header + +return: + ret i32 %sum +} diff --git a/test/Transforms/LoopLoadElim/forward.ll b/test/Transforms/LoopLoadElim/forward.ll index ed0d162ab7e3..9a0e03a317c8 100644 --- a/test/Transforms/LoopLoadElim/forward.ll +++ b/test/Transforms/LoopLoadElim/forward.ll @@ -16,8 +16,8 @@ define void @f(i32* %A, i32* %B, i32* %C, i64 %N) { ; CHECK-NOT: %found.conflict{{.*}} = entry: -; for.body.ph: -; CHECK: %load_initial = load i32, i32* %A +; Make sure the hoisted load keeps the alignment +; CHECK: %load_initial = load i32, i32* %A, align 1 br label %for.body for.body: ; preds = %for.body, %entry @@ -34,7 +34,7 @@ for.body: ; preds = %for.body, %entry %a_p1 = add i32 %b, 2 store i32 %a_p1, i32* %Aidx_next, align 4 - %a = load i32, i32* %Aidx, align 4 + %a = load i32, i32* %Aidx, align 1 ; CHECK: %c = mul i32 %store_forwarded, 2 %c = mul i32 %a, 2 store i32 %c, i32* %Cidx, align 4 diff --git a/test/Transforms/LoopVectorize/iv_outside_user.ll b/test/Transforms/LoopVectorize/iv_outside_user.ll index d536d1023f41..8a44af96e7f4 100644 --- a/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -133,3 +133,48 @@ for.end: store i32 %phi2, i32* %p ret i32 %phi } + +; CHECK-LABEL: @PR30742 +; CHECK: min.iters.checked +; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2 +; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] +; CHECK: middle.block +; CHECK: %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]] +; CHECK: %[[T15:.+]] = add i32 %tmp03, -7 +; CHECK: %[[T16:.+]] = shl i32 %[[N_MOD_VF]], 3 +; CHECK: %[[T17:.+]] = add i32 %[[T15]], %[[T16]] +; CHECK: %[[T18:.+]] = shl i32 {{.*}}, 3 +; CHECK: %ind.escape = sub i32 %[[T17]], %[[T18]] +; CHECK: br i1 %[[CMP]], label %BB3, label %scalar.ph +define void @PR30742() { +BB0: + br label %BB1 + +BB1: + %tmp00 = load i32, i32* undef, align 16 + %tmp01 = sub i32 %tmp00, undef + %tmp02 = icmp slt i32 %tmp01, 1 + %tmp03 = select i1 %tmp02, i32 1, i32 %tmp01 + %tmp04 = add nsw i32 %tmp03, -7 + br label %BB2 + +BB2: + %tmp05 = phi i32 [ %tmp04, %BB1 ], [ %tmp06, %BB2 ] + %tmp06 = add i32 %tmp05, -8 + %tmp07 = icmp sgt i32 %tmp06, 0 + br i1 %tmp07, label %BB2, label %BB3 + +BB3: + %tmp08 = phi i32 [ %tmp05, %BB2 ] + %tmp09 = sub i32 %tmp00, undef + %tmp10 = icmp slt i32 %tmp09, 1 + %tmp11 = select i1 %tmp10, i32 1, i32 %tmp09 + %tmp12 = add nsw i32 %tmp11, -7 + br label %BB4 + +BB4: + %tmp13 = phi i32 [ %tmp12, %BB3 ], [ %tmp14, %BB4 ] + %tmp14 = add i32 %tmp13, -8 + %tmp15 = icmp sgt i32 %tmp14, 0 + br i1 %tmp15, label %BB4, label %BB1 +} diff --git a/test/Transforms/NewGVN/basic-cyclic-opt.ll b/test/Transforms/NewGVN/basic-cyclic-opt.ll new file mode 100644 index 000000000000..523ed2612e3c --- /dev/null +++ b/test/Transforms/NewGVN/basic-cyclic-opt.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s +target datalayout = 
"e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +;; Function Attrs: nounwind ssp uwtable +;; We should eliminate the sub, and one of the phi nodes +define void @vnum_test1(i32* %data) #0 { +; CHECK-LABEL: @vnum_test1( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[M_0:%.*]] = phi i32 [ [[TMP3]], [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB17:%.*]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP18:%.*]], [[BB17]] ] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt i32 [[I_0]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB6:%.*]], label [[BB19:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 [[TMP9]] +; CHECK-NEXT: store i32 2, i32* [[TMP10]], align 4 +; CHECK-NEXT: store i32 0, i32* [[DATA]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP15]] = add nsw i32 [[M_0]], [[TMP14]] +; CHECK-NEXT: br label [[BB17]] +; CHECK: bb17: +; CHECK-NEXT: [[TMP18]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb19: +; CHECK-NEXT: ret void +; +bb: + %tmp = getelementptr inbounds i32, i32* %data, i64 3 + %tmp1 = load i32, i32* %tmp, align 4 + %tmp2 = getelementptr inbounds i32, i32* %data, i64 4 + %tmp3 = load i32, i32* %tmp2, align 4 + br label %bb4 + +bb4: ; preds = %bb17, %bb + %m.0 = phi i32 [ %tmp3, %bb ], [ %tmp15, %bb17 ] + %i.0 = phi i32 [ 0, %bb ], [ %tmp18, %bb17 ] + %n.0 = phi i32 [ %tmp3, %bb ], [ %tmp16, %bb17 ] + %tmp5 = icmp slt i32 %i.0, %tmp1 + br i1 %tmp5, label %bb6, label %bb19 + +bb6: ; preds = %bb4 + %tmp7 = getelementptr inbounds i32, i32* %data, i64 2 + %tmp8 = load i32, i32* %tmp7, align 4 + %tmp9 = sext i32 %tmp8 to i64 + %tmp10 = getelementptr inbounds i32, i32* %data, i64 %tmp9 + store i32 2, i32* %tmp10, align 4 + %tmp11 = sub nsw i32 %m.0, %n.0 + %tmp12 = getelementptr inbounds i32, i32* %data, i64 0 + store i32 %tmp11, i32* %tmp12, align 4 + %tmp13 = getelementptr inbounds i32, i32* %data, i64 1 + %tmp14 = load i32, i32* %tmp13, align 4 + %tmp15 = add nsw i32 %m.0, %tmp14 + %tmp16 = add nsw i32 %n.0, %tmp14 + br label %bb17 + +bb17: ; preds = %bb6 + %tmp18 = add nsw i32 %i.0, 1 + br label %bb4 + +bb19: ; preds = %bb4 + ret void +} + +;; Function Attrs: nounwind ssp uwtable +;; We should eliminate the sub, one of the phi nodes, prove the store of the sub +;; and the load of data are equivalent, that the load always produces constant 0, and +;; delete the load replacing it with constant 0. 
+define i32 @vnum_test2(i32* %data) #0 { +; CHECK-LABEL: @vnum_test2( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[M_0:%.*]] = phi i32 [ [[TMP3]], [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB19:%.*]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP20:%.*]], [[BB19]] ] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt i32 [[I_0]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB6:%.*]], label [[BB21:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 [[TMP9]] +; CHECK-NEXT: store i32 2, i32* [[TMP10]], align 4 +; CHECK-NEXT: store i32 0, i32* [[DATA]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP15]] = add nsw i32 [[M_0]], [[TMP14]] +; CHECK-NEXT: br label [[BB19]] +; CHECK: bb19: +; CHECK-NEXT: [[TMP20]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb21: +; CHECK-NEXT: ret i32 0 +; +bb: + %tmp = getelementptr inbounds i32, i32* %data, i64 3 + %tmp1 = load i32, i32* %tmp, align 4 + %tmp2 = getelementptr inbounds i32, i32* %data, i64 4 + %tmp3 = load i32, i32* %tmp2, align 4 + br label %bb4 + +bb4: ; preds = %bb19, %bb + %m.0 = phi i32 [ %tmp3, %bb ], [ %tmp15, %bb19 ] + %n.0 = phi i32 [ %tmp3, %bb ], [ %tmp16, %bb19 ] + %i.0 = phi i32 [ 0, %bb ], [ %tmp20, %bb19 ] + %p.0 = phi i32 [ undef, %bb ], [ %tmp18, %bb19 ] + %tmp5 = icmp slt i32 %i.0, %tmp1 + br i1 %tmp5, label %bb6, label %bb21 + +bb6: ; preds = %bb4 + %tmp7 = getelementptr inbounds i32, i32* %data, i64 2 + %tmp8 = load i32, i32* %tmp7, align 4 + %tmp9 = sext i32 %tmp8 to i64 + %tmp10 = getelementptr inbounds i32, i32* %data, i64 %tmp9 + store i32 2, i32* %tmp10, align 4 + %tmp11 = sub nsw i32 %m.0, %n.0 + %tmp12 = getelementptr inbounds i32, i32* %data, i64 0 + store i32 %tmp11, i32* %tmp12, align 4 + %tmp13 = getelementptr inbounds i32, i32* %data, i64 1 + %tmp14 = load i32, i32* %tmp13, align 4 + %tmp15 = add nsw i32 %m.0, %tmp14 + %tmp16 = add nsw i32 %n.0, %tmp14 + %tmp17 = getelementptr inbounds i32, i32* %data, i64 0 + %tmp18 = load i32, i32* %tmp17, align 4 + br label %bb19 + +bb19: ; preds = %bb6 + %tmp20 = add nsw i32 %i.0, 1 + br label %bb4 + +bb21: ; preds = %bb4 + ret i32 %p.0 +} + + +; Function Attrs: nounwind ssp uwtable +;; Same as test 2, with a conditional store of m-n, so it has to also discover +;; that data ends up with the same value no matter what branch is taken. 
+define i32 @vnum_test3(i32* %data) #0 { +; CHECK-LABEL: @vnum_test3( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[N_0:%.*]] = phi i32 [ [[TMP3]], [[BB:%.*]] ], [ [[TMP19:%.*]], [[BB21:%.*]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP22:%.*]], [[BB21]] ] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt i32 [[I_0]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB6:%.*]], label [[BB23:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 5 +; CHECK-NEXT: store i32 0, i32* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[I_0]], 30 +; CHECK-NEXT: br i1 [[TMP10]], label [[BB11:%.*]], label [[BB14:%.*]] +; CHECK: bb11: +; CHECK-NEXT: store i32 0, i32* [[TMP9]], align 4 +; CHECK-NEXT: br label [[BB14]] +; CHECK: bb14: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +; CHECK-NEXT: [[TMP19]] = add nsw i32 [[N_0]], [[TMP18]] +; CHECK-NEXT: br label [[BB21]] +; CHECK: bb21: +; CHECK-NEXT: [[TMP22]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb23: +; CHECK-NEXT: ret i32 0 +; +bb: + %tmp = getelementptr inbounds i32, i32* %data, i64 3 + %tmp1 = load i32, i32* %tmp, align 4 + %tmp2 = getelementptr inbounds i32, i32* %data, i64 4 + %tmp3 = load i32, i32* %tmp2, align 4 + br label %bb4 + +bb4: ; preds = %bb21, %bb + %n.0 = phi i32 [ %tmp3, %bb ], [ %tmp20, %bb21 ] + %m.0 = phi i32 [ %tmp3, %bb ], [ %tmp19, %bb21 ] + %p.0 = phi i32 [ 0, %bb ], [ %tmp16, %bb21 ] + %i.0 = phi i32 [ 0, %bb ], [ %tmp22, %bb21 ] + %tmp5 = icmp slt i32 %i.0, %tmp1 + br i1 %tmp5, label %bb6, label %bb23 + +bb6: ; preds = %bb4 + %tmp7 = getelementptr inbounds i32, i32* %data, i64 2 + %tmp8 = load i32, i32* %tmp7, align 4 + %tmp9 = getelementptr inbounds i32, i32* %data, i64 5 + store i32 0, i32* %tmp9, align 4 + %tmp10 = icmp slt i32 %i.0, 30 + br i1 %tmp10, label %bb11, label %bb14 + +bb11: ; preds = %bb6 + %tmp12 = sub nsw i32 %m.0, %n.0 + %tmp13 = getelementptr inbounds i32, i32* %data, i64 5 + store i32 %tmp12, i32* %tmp13, align 4 + br label %bb14 + +bb14: ; preds = %bb11, %bb6 + %tmp15 = getelementptr inbounds i32, i32* %data, i64 5 + %tmp16 = load i32, i32* %tmp15, align 4 + %tmp17 = getelementptr inbounds i32, i32* %data, i64 1 + %tmp18 = load i32, i32* %tmp17, align 4 + %tmp19 = add nsw i32 %m.0, %tmp18 + %tmp20 = add nsw i32 %n.0, %tmp18 + br label %bb21 + +bb21: ; preds = %bb14 + %tmp22 = add nsw i32 %i.0, 1 + br label %bb4 + +bb23: ; preds = %bb4 + ret i32 %p.0 +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0, !0, !0} + +!0 = !{!"Apple LLVM version 6.0 (clang-600.0.56) (based on LLVM 3.5svn)"} diff --git a/test/Transforms/NewGVN/cyclic-phi-handling.ll b/test/Transforms/NewGVN/cyclic-phi-handling.ll new file mode 100644 index 000000000000..283c78548995 --- /dev/null +++ 
b/test/Transforms/NewGVN/cyclic-phi-handling.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i32 %arg, i32 %arg1, i32 (i32, i32)* %arg2) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %bb3
+; CHECK: bb3:
+; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ %arg1, %bb ], [ [[TMP:%.*]]4, %bb7 ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ %arg, %bb ], [ [[TMP]], %bb7 ]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 %arg2(i32 [[TMP4]], i32 [[TMP]])
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label %bb7, label %bb8
+; CHECK: bb7:
+; CHECK-NEXT: br label %bb3
+; CHECK: bb8:
+; CHECK-NEXT: ret void
+;
+bb:
+  br label %bb3
+
+;; While non-standard, LLVM allows mutually dependent phi nodes.
+;; Ensure we do not loop forever trying to process them.
+bb3:                                              ; preds = %bb7, %bb
+  %tmp = phi i32 [ %arg1, %bb ], [ %tmp4, %bb7 ]
+  %tmp4 = phi i32 [ %arg, %bb ], [ %tmp, %bb7 ]
+  %tmp5 = call i32 %arg2(i32 %tmp4, i32 %tmp)
+  %tmp6 = icmp ne i32 %tmp5, 0
+  br i1 %tmp6, label %bb7, label %bb8
+
+bb7:                                              ; preds = %bb3
+  br label %bb3
+
+bb8:                                              ; preds = %bb3
+  ret void
+}
diff --git a/test/Transforms/NewGVN/invariant.group.ll b/test/Transforms/NewGVN/invariant.group.ll
index 2bddc99c8b85..80c6e05a8e24 100644
--- a/test/Transforms/NewGVN/invariant.group.ll
+++ b/test/Transforms/NewGVN/invariant.group.ll
@@ -345,11 +345,63 @@ _Z1gR1A.exit:                                     ; preds = %0, %5
   ret void
 }
 
+; Check that no optimizations are performed with global pointers.
+; FIXME: we could do these optimizations if we checked whether the dependency
+; comes from the same function.
+; CHECK-LABEL: define void @testGlobal() {
+define void @testGlobal() {
+; CHECK: %a = load i8, i8* @unknownPtr, !invariant.group !0
+  %a = load i8, i8* @unknownPtr, !invariant.group !0
+  call void @foo2(i8* @unknownPtr, i8 %a)
+; CHECK: %1 = load i8, i8* @unknownPtr, !invariant.group !0
+  %1 = load i8, i8* @unknownPtr, !invariant.group !0
+  call void @bar(i8 %1)
+
+  %b0 = bitcast i8* @unknownPtr to i1*
+  call void @fooBit(i1* %b0, i1 1)
+; Using a regex here because bitcasts get canonicalized.
+; CHECK: %2 = load i1, i1* {{.*}}, !invariant.group !0
+  %2 = load i1, i1* %b0, !invariant.group !0
+  call void @fooBit(i1* %b0, i1 %2)
+; CHECK: %3 = load i1, i1* {{.*}}, !invariant.group !0
+  %3 = load i1, i1* %b0, !invariant.group !0
+  call void @fooBit(i1* %b0, i1 %3)
+  ret void
+}
+; And the same case where the pointer is not global.
+; CHECK-LABEL: define void @testNotGlobal() {
+define void @testNotGlobal() {
+  %a = alloca i8
+  call void @foo(i8* %a)
+; CHECK: %b = load i8, i8* %a, !invariant.group !0
+  %b = load i8, i8* %a, !invariant.group !0
+  call void @foo2(i8* %a, i8 %b)
+
+  %1 = load i8, i8* %a, !invariant.group !0
+; CHECK: call void @bar(i8 %b)
+  call void @bar(i8 %1)
+
+  %b0 = bitcast i8* %a to i1*
+  call void @fooBit(i1* %b0, i1 1)
+; CHECK: %trunc = trunc i8 %b to i1
+  %2 = load i1, i1* %b0, !invariant.group !0
+; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc)
+  call void @fooBit(i1* %b0, i1 %2)
+  %3 = load i1, i1* %b0, !invariant.group !0
+; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc)
+  call void @fooBit(i1* %b0, i1 %3)
+  ret void
+}
+
+
 declare void @foo(i8*)
+declare void @foo2(i8*, i8)
 declare void @bar(i8)
 declare i8* @getPointer(i8*)
 declare void @_ZN1A3fooEv(%struct.A*)
 declare void @_ZN1AC1Ev(%struct.A*)
+declare void @fooBit(i1*, i1)
+
 declare i8* @llvm.invariant.group.barrier(i8*)
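+
+; A minimal sketch of the contract the checks above rely on (the names %p and
+; @opaque are hypothetical, not part of this test): two loads of the same
+; pointer that both carry the same !invariant.group metadata may be assumed to
+; yield the same value, so GVN can forward the first value to the second load
+; even across an opaque call:
+;   %v1 = load i8, i8* %p, !invariant.group !0
+;   call void @opaque(i8* %p)
+;   %v2 = load i8, i8* %p, !invariant.group !0 ; may be replaced by %v1
+; @testNotGlobal above gets exactly this treatment; @testGlobal does not,
+; because the pointer is a global (see the FIXME).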

 ; Function Attrs: nounwind
diff --git a/test/Transforms/NewGVN/memory-handling.ll b/test/Transforms/NewGVN/memory-handling.ll
new file mode 100644
index 000000000000..a0c4a998b8b6
--- /dev/null
+++ b/test/Transforms/NewGVN/memory-handling.ll
@@ -0,0 +1,195 @@
+;; This test depends on propagating a lot of memory information around while,
+;; in the end, not miscompiling a single add.
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.Letter = type { i32, i32, i32, i32 }
+
+@alPhrase = external local_unnamed_addr global [26 x %struct.Letter], align 16
+@aqMainMask = external local_unnamed_addr global [2 x i64], align 16
+@aqMainSign = external local_unnamed_addr global [2 x i64], align 16
+@cchPhraseLength = external local_unnamed_addr global i32, align 4
+@auGlobalFrequency = external local_unnamed_addr global [26 x i32], align 16
+@.str.7 = external hidden unnamed_addr constant [28 x i8], align 1
+
+; Function Attrs: nounwind uwtable
+declare void @Fatal(i8*, i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind readnone
+declare i16** @__ctype_b_loc() local_unnamed_addr #1
+
+; Function Attrs: nounwind uwtable
+define void @BuildMask(i8* nocapture readonly) local_unnamed_addr #0 {
+  tail call void @llvm.memset.p0i8.i64(i8* bitcast ([26 x %struct.Letter]* @alPhrase to i8*), i8 0, i64 416, i32 16, i1 false)
+  tail call void @llvm.memset.p0i8.i64(i8* bitcast ([2 x i64]* @aqMainMask to i8*), i8 0, i64 16, i32 16, i1 false)
+  tail call void @llvm.memset.p0i8.i64(i8* bitcast ([2 x i64]* @aqMainSign to i8*), i8 0, i64 16, i32 16, i1 false)
+  br label %.sink.split
+
+.sink.split:                                      ; preds = %14, %1
+  %.0 = phi i8* [ %0, %1 ], [ %.lcssa67, %14 ]
+  %.sink = phi i32 [ 0, %1 ], [ %23, %14 ]
+  store i32 %.sink, i32* @cchPhraseLength, align 4, !tbaa !1
+  br label %2
+
; b;
-  // expected-note@+1 {{in instantiation of default member initializer}}
 template <typename T> struct C { T a = { 0 }; }; // expected-error{{explicit}}
-  C c; // expected-note{{here}}
+  C c; // expected-note {{in instantiation of default member initializer}}
 }
 
 namespace PR16903 {
diff --git a/test/SemaTemplate/temp_arg_nontype.cpp b/test/SemaTemplate/temp_arg_nontype.cpp
index 93f11b5657d0..27a0a03f84f4 100644
--- a/test/SemaTemplate/temp_arg_nontype.cpp
+++ b/test/SemaTemplate/temp_arg_nontype.cpp
@@ -173,12 +173,16 @@ namespace pr6249 {
 }
 
 namespace PR6723 {
-  template <char C> void f(int (&a)[C]); // expected-note {{candidate template ignored}} \
-  // expected-note{{substitution failure [with C = '\x00']}}
+  template <char C> void f(int (&a)[C]); // expected-note 3{{candidate template ignored: substitution failure [with C = '\x00']}}
+  // expected-note@-1 {{not viable: no known conversion from 'int [512]' to 'int (&)[0]'}}
   void g() {
     int arr512[512];
     f(arr512); // expected-error{{no matching function for call}}
     f<512>(arr512); // expected-error{{no matching function for call}}
+
+    int arr0[0];
+    f(arr0); // expected-error{{no matching function for call}}
+    f<0>(arr0); // expected-error{{no matching function for call}}
   }
 }
diff --git a/tools/c-index-test/core_main.cpp b/tools/c-index-test/core_main.cpp
index 8976d9134916..0ab24fb6ccb9 100644
--- a/tools/c-index-test/core_main.cpp
+++ b/tools/c-index-test/core_main.cpp
@@ -166,6 +166,8 @@ static bool printSourceSymbols(ArrayRef<const char *> Args) {
 
 static void printSymbolInfo(SymbolInfo SymInfo, raw_ostream &OS) {
   OS << getSymbolKindString(SymInfo.Kind);
+  if (SymInfo.SubKind != SymbolSubKind::None)
+    OS << '/' << 
getSymbolSubKindString(SymInfo.SubKind); if (SymInfo.Properties) { OS << '('; printSymbolProperties(SymInfo.Properties, OS); diff --git a/tools/driver/CMakeLists.txt b/tools/driver/CMakeLists.txt index 49bde947f4c6..f6e26fa11f41 100644 --- a/tools/driver/CMakeLists.txt +++ b/tools/driver/CMakeLists.txt @@ -72,7 +72,7 @@ endforeach() # Configure plist creation for OS X. set (TOOL_INFO_PLIST "Info.plist" CACHE STRING "Plist name") -if (APPLE) +if (APPLE) if (CLANG_VENDOR) set(TOOL_INFO_NAME "${CLANG_VENDOR} clang") else() @@ -82,20 +82,19 @@ if (APPLE) set(TOOL_INFO_UTI "${CLANG_VENDOR_UTI}") set(TOOL_INFO_VERSION "${CLANG_VERSION}") set(TOOL_INFO_BUILD_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") - + set(TOOL_INFO_PLIST_OUT "${CMAKE_CURRENT_BINARY_DIR}/${TOOL_INFO_PLIST}") target_link_libraries(clang "-Wl,-sectcreate,__TEXT,__info_plist,${TOOL_INFO_PLIST_OUT}") configure_file("${TOOL_INFO_PLIST}.in" "${TOOL_INFO_PLIST_OUT}" @ONLY) - + set(TOOL_INFO_UTI) set(TOOL_INFO_NAME) set(TOOL_INFO_VERSION) set(TOOL_INFO_BUILD_VERSION) endif() -# the linker -order_file flag is only supported by ld64 -if(LD64_EXECUTABLE AND CLANG_ORDER_FILE) +if(CLANG_ORDER_FILE AND (LD64_EXECUTABLE OR GOLD_EXECUTABLE)) include(CMakePushCheckState) function(check_linker_flag flag out_var) @@ -105,9 +104,14 @@ if(LD64_EXECUTABLE AND CLANG_ORDER_FILE) cmake_pop_check_state() endfunction() + if (LD64_EXECUTABLE) + set(LINKER_ORDER_FILE_OPTION "-Wl,-order_file,${CLANG_ORDER_FILE}") + elseif (GOLD_EXECUTABLE) + set(LINKER_ORDER_FILE_OPTION "-Wl,--section-ordering-file,${CLANG_ORDER_FILE}") + endif() + # This is a test to ensure the actual order file works with the linker. - check_linker_flag("-Wl,-order_file,${CLANG_ORDER_FILE}" - LINKER_ORDER_FILE_WORKS) + check_linker_flag(${LINKER_ORDER_FILE_OPTION} LINKER_ORDER_FILE_WORKS) # Passing an empty order file disables some linker layout optimizations. 
# To work around this and enable workflows for re-linking when the order file @@ -117,7 +121,7 @@ if(LD64_EXECUTABLE AND CLANG_ORDER_FILE) if("${ORDER_FILE}" STREQUAL "\n") set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CLANG_ORDER_FILE}) elseif(LINKER_ORDER_FILE_WORKS) - target_link_libraries(clang "-Wl,-order_file,${CLANG_ORDER_FILE}") + target_link_libraries(clang ${LINKER_ORDER_FILE_OPTION}) set_target_properties(clang PROPERTIES LINK_DEPENDS ${CLANG_ORDER_FILE}) endif() endif() diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp index 6f9df680eef5..629e85803d64 100644 --- a/unittests/Format/FormatTest.cpp +++ b/unittests/Format/FormatTest.cpp @@ -5780,6 +5780,10 @@ TEST_F(FormatTest, UnderstandsUsesOfStarAndAmp) { verifyGoogleFormat("MACRO Constructor(const int& i) : a(a), b(b) {}"); verifyFormat("void f() { f(a, c * d); }"); verifyFormat("void f() { f(new a(), c * d); }"); + verifyFormat("void f(const MyOverride &override);"); + verifyFormat("void f(const MyFinal &final);"); + verifyIndependentOfContext("bool a = f() && override.f();"); + verifyIndependentOfContext("bool a = f() && final.f();"); verifyIndependentOfContext("InvalidRegions[*R] = 0;"); diff --git a/unittests/Format/FormatTestJS.cpp b/unittests/Format/FormatTestJS.cpp index 59f4a4f6dcfe..230717fe47cc 100644 --- a/unittests/Format/FormatTestJS.cpp +++ b/unittests/Format/FormatTestJS.cpp @@ -858,6 +858,26 @@ TEST_F(FormatTestJS, AutomaticSemicolonInsertionHeuristic) { "return 1", "a = null\n" " return 1"); + verifyFormat( + "x = {\n" + " a: 1\n" + "}\n" + "class Y {}", + " x = {a : 1}\n" + " class Y { }"); +} + +TEST_F(FormatTestJS, ImportExportASI) { + verifyFormat( + "import {x} from 'y'\n" + "export function z() {}", + "import {x} from 'y'\n" + " export function z() {}"); + verifyFormat( + "export {x}\n" + "class Y {}", + " export {x}\n" + " class Y {\n}"); } TEST_F(FormatTestJS, ClosureStyleCasts) { diff --git a/www/cxx_dr_status.html b/www/cxx_dr_status.html index ee8ce025eb47..e7d2e5f87c9c 100644 --- a/www/cxx_dr_status.html +++ b/www/cxx_dr_status.html @@ -28,7 +28,7 @@

 <h1>C++ Defect Report Support in Clang</h1>
 
-<p>Last updated: $Date: 2017-01-02 12:15:42 +0100 (Mon, 02 Jan 2017) $</p>
+<p>Last updated: $Date: 2017-01-09 09:01:21 +0100 (Mon, 09 Jan 2017) $</p>
 
 <h2 id="cxxdr">C++ defect report implementation status</h2>
@@ -8143,7 +8143,7 @@ and POD class
     <td>1388</td>
     <td>CD3</td>
     <td>Missing non-deduced context following a function parameter pack</td>
-    <td align="center">Unknown</td>
+    <td align="center">SVN</td>
   </tr>
   <tr id="1389">
@@ -8161,7 +8161,7 @@ and POD class
     <td>1391</td>
     <td>DRWP</td>
     <td>Conversions to parameter types with non-deduced template arguments</td>
-    <td align="center">Unknown</td>
+    <td align="center">Partial</td>
   </tr>
   <tr id="1392">
@@ -8209,7 +8209,7 @@ and POD class
     <td>1399</td>
     <td>CD3</td>
     <td>Deduction with multiple function parameter packs</td>
-    <td align="center">Unknown</td>
+    <td align="center">Duplicate of 1388</td>
   </tr>
   <tr id="1400">
-- 
cgit v1.2.3
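A minimal illustration of the rule behind the 1391 row above ("Conversions
to parameter types with non-deduced template arguments", now Partial); the
names Id, f, and the call are hypothetical, not taken from this patch:

  template <class T> struct Id { typedef T type; };

  // T appears in the second parameter only inside a non-deduced context
  // (the nested name Id<T>::type), so T is deduced from the first argument
  // alone.
  template <class T> void f(T, typename Id<T>::type) {}

  int main() {
    f(1, 2.5); // T = int; DR1391 permits the implicit double -> int
               // conversion for the second argument.
  }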