64 files changed, 2787 insertions, 2008 deletions
diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h
index aeda39308f91..91b3d11a549a 100644
--- a/include/clang-c/Index.h
+++ b/include/clang-c/Index.h
@@ -32,7 +32,7 @@
  * compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable.
  */
 #define CINDEX_VERSION_MAJOR 0
-#define CINDEX_VERSION_MINOR 40
+#define CINDEX_VERSION_MINOR 41
 
 #define CINDEX_VERSION_ENCODE(major, minor) ( \
       ((major) * 10000)                       \
@@ -1419,6 +1419,15 @@ CINDEX_LINKAGE int clang_saveTranslationUnit(CXTranslationUnit TU,
                                              unsigned options);
 
 /**
+ * \brief Suspend a translation unit in order to free memory associated with it.
+ *
+ * A suspended translation unit uses significantly less memory but on the other
+ * side does not support any other calls than \c clang_reparseTranslationUnit
+ * to resume it or \c clang_disposeTranslationUnit to dispose it completely.
+ */
+CINDEX_LINKAGE unsigned clang_suspendTranslationUnit(CXTranslationUnit);
+
+/**
  * \brief Destroy the specified CXTranslationUnit object.
  */
 CINDEX_LINKAGE void clang_disposeTranslationUnit(CXTranslationUnit);
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index 77db8993f018..b393ce5f1545 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -525,6 +525,8 @@ def err_pp_module_begin_without_module_end : Error<
 def err_pp_module_end_without_module_begin : Error<
   "no matching '#pragma clang module begin' for this "
   "'#pragma clang module end'">;
+def note_pp_module_begin_here : Note<
+  "entering module '%0' due to this pragma">;
 
 def err_defined_macro_name : Error<"'defined' cannot be used as a macro name">;
 def err_paste_at_start : Error<
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 4934bcfa3890..629e8b837f59 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -8312,8 +8312,13 @@ def err_opencl_bitfields : Error<
   "bit-fields are not supported in OpenCL">;
 def err_opencl_vla : Error<
   "variable length arrays are not supported in OpenCL">;
+def err_opencl_scalar_type_rank_greater_than_vector_type : Error<
+    "scalar operand type has greater rank than the type of the vector "
+    "element. (%0 and %1)">;
 def err_bad_kernel_param_type : Error<
   "%0 cannot be used as the type of a kernel parameter">;
+def err_opencl_implicit_function_decl : Error<
+  "implicit declaration of function %0 is invalid in OpenCL">;
 def err_record_with_pointers_kernel_param : Error<
   "%select{struct|union}0 kernel parameters may not contain pointers">;
 def note_within_field_of_type : Note<
@@ -8744,8 +8749,8 @@ def err_omp_not_mappable_type : Error<
   "type %0 is not mappable to target">;
 def err_omp_invalid_map_type_for_directive : Error<
   "%select{map type '%1' is not allowed|map type must be specified}0 for '#pragma omp %2'">;
-def err_omp_no_map_for_directive : Error<
-  "expected at least one map clause for '#pragma omp %0'">;
+def err_omp_no_clause_for_directive : Error<
+  "expected at least one %0 clause for '#pragma omp %1'">;
 def note_omp_polymorphic_in_target : Note<
   "mappable type cannot be polymorphic">;
 def note_omp_static_member_in_target : Note<
diff --git a/include/clang/Frontend/ASTUnit.h b/include/clang/Frontend/ASTUnit.h
index 7f70609efc97..fb0a5e8acd4a 100644
--- a/include/clang/Frontend/ASTUnit.h
+++ b/include/clang/Frontend/ASTUnit.h
@@ -878,6 +878,11 @@ public:
                ArrayRef<RemappedFile> RemappedFiles = None,
                IntrusiveRefCntPtr<vfs::FileSystem> VFS = nullptr);
 
+  /// \brief Free data that will be re-generated on the next parse.
+  ///
+  /// Preamble-related data is not affected.
+  void ResetForParse();
+
   /// \brief Perform code completion at the given file, line, and
   /// column within this translation unit.
   ///
diff --git a/include/clang/Lex/Preprocessor.h b/include/clang/Lex/Preprocessor.h
index 9f015eaa230b..aeca83a90716 100644
--- a/include/clang/Lex/Preprocessor.h
+++ b/include/clang/Lex/Preprocessor.h
@@ -283,6 +283,44 @@ class Preprocessor {
   /// This is used when loading a precompiled preamble.
   std::pair<int, bool> SkipMainFilePreamble;
 
+  class PreambleConditionalStackStore {
+    enum State {
+      Off = 0,
+      Recording = 1,
+      Replaying = 2,
+    };
+
+  public:
+    PreambleConditionalStackStore() : ConditionalStackState(Off) {}
+
+    void startRecording() { ConditionalStackState = Recording; }
+    void startReplaying() { ConditionalStackState = Replaying; }
+    bool isRecording() const { return ConditionalStackState == Recording; }
+    bool isReplaying() const { return ConditionalStackState == Replaying; }
+
+    ArrayRef<PPConditionalInfo> getStack() const {
+      return ConditionalStack;
+    }
+
+    void doneReplaying() {
+      ConditionalStack.clear();
+      ConditionalStackState = Off;
+    }
+
+    void setStack(ArrayRef<PPConditionalInfo> s) {
+      if (!isRecording() && !isReplaying())
+        return;
+      ConditionalStack.clear();
+      ConditionalStack.append(s.begin(), s.end());
+    }
+
+    bool hasRecordedPreamble() const { return !ConditionalStack.empty(); }
+
+  private:
+    SmallVector<PPConditionalInfo, 4> ConditionalStack;
+    State ConditionalStackState;
+  } PreambleConditionalStack;
+
   /// \brief The current top of the stack that we're lexing from if
   /// not expanding a macro and we are lexing directly from source code.
   ///
@@ -1695,6 +1733,11 @@ public:
   /// \brief Return true if we're in the top-level file, not in a \#include.
   bool isInPrimaryFile() const;
 
+  /// \brief Return true if we're in the main file (specifically, if we are 0
+  /// (zero) levels deep \#include. This is used by the lexer to determine if
+  /// it needs to generate errors about unterminated \#if directives.
+  bool isInMainFile() const;
+
   /// \brief Handle cases where the \#include name is expanded
   /// from a macro as multiple tokens, which need to be glued together. 
   ///
@@ -1932,6 +1975,27 @@ public:
                                                           Module *M,
                                                           SourceLocation MLoc);
 
+  bool isRecordingPreamble() const {
+    return PreambleConditionalStack.isRecording();
+  }
+
+  bool hasRecordedPreamble() const {
+    return PreambleConditionalStack.hasRecordedPreamble();
+  }
+
+  ArrayRef<PPConditionalInfo> getPreambleConditionalStack() const {
+      return PreambleConditionalStack.getStack();
+  }
+
+  void setRecordedPreambleConditionalStack(ArrayRef<PPConditionalInfo> s) {
+    PreambleConditionalStack.setStack(s);
+  }
+
+  void setReplayablePreambleConditionalStack(ArrayRef<PPConditionalInfo> s) {
+    PreambleConditionalStack.startReplaying();
+    PreambleConditionalStack.setStack(s);
+  }
+
 private:
   // Macro handling.
   void HandleDefineDirective(Token &Tok, bool ImmediatelyAfterTopLevelIfndef);
diff --git a/include/clang/Lex/PreprocessorLexer.h b/include/clang/Lex/PreprocessorLexer.h
index 6d6cf05a96c4..5c2e4d41454b 100644
--- a/include/clang/Lex/PreprocessorLexer.h
+++ b/include/clang/Lex/PreprocessorLexer.h
@@ -17,6 +17,7 @@
 
 #include "clang/Lex/MultipleIncludeOpt.h"
 #include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 
 namespace clang {
@@ -176,6 +177,11 @@ public:
   conditional_iterator conditional_end() const { 
     return ConditionalStack.end(); 
   }
+
+  void setConditionalLevels(ArrayRef<PPConditionalInfo> CL) {
+    ConditionalStack.clear();
+    ConditionalStack.append(CL.begin(), CL.end());
+  }
 };
 
 }  // end namespace clang
diff --git a/include/clang/Lex/PreprocessorOptions.h b/include/clang/Lex/PreprocessorOptions.h
index 58d79f7ff81a..c85d2384fa47 100644
--- a/include/clang/Lex/PreprocessorOptions.h
+++ b/include/clang/Lex/PreprocessorOptions.h
@@ -80,7 +80,14 @@ public:
   /// The boolean indicates whether the preamble ends at the start of a new
   /// line.
   std::pair<unsigned, bool> PrecompiledPreambleBytes;
-  
+
+  /// \brief True indicates that a preamble is being generated.
+  ///
+  /// When the lexer is done, one of the things that need to be preserved is the
+  /// conditional #if stack, so the ASTWriter/ASTReader can save/restore it when
+  /// processing the rest of the file.
+  bool GeneratePreamble;
+
   /// The implicit PTH input included at the start of the translation unit, or
   /// empty.
   std::string ImplicitPTHInclude;
@@ -144,6 +151,7 @@ public:
                           AllowPCHWithCompilerErrors(false),
                           DumpDeserializedPCHDecls(false),
                           PrecompiledPreambleBytes(0, true),
+                          GeneratePreamble(false),
                           RemappedFilesKeepOriginalName(true),
                           RetainRemappedFileBuffers(false),
                           ObjCXXARCStandardLibrary(ARCXX_nolib) { }
diff --git a/include/clang/Serialization/ASTBitCodes.h b/include/clang/Serialization/ASTBitCodes.h
index 823440b19713..6b40781a1239 100644
--- a/include/clang/Serialization/ASTBitCodes.h
+++ b/include/clang/Serialization/ASTBitCodes.h
@@ -607,6 +607,9 @@ namespace clang {
 
       /// \brief Record code for \#pragma pack options.
       PACK_PRAGMA_OPTIONS = 61,
+
+      /// \brief The stack of open #ifs/#ifdefs recorded in a preamble.
+      PP_CONDITIONAL_STACK = 62,
     };
 
     /// \brief Record types used within a source manager block.
diff --git a/include/clang/StaticAnalyzer/Checkers/Checkers.td b/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 790ba5c121c9..4171c685cb98 100644
--- a/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -279,15 +279,15 @@ def VirtualCallChecker : Checker<"VirtualCall">,
 
 let ParentPackage = CplusplusAlpha in {
 
+def IteratorRangeChecker : Checker<"IteratorRange">,
+  HelpText<"Check for iterators used outside their valid ranges">,
+  DescFile<"IteratorChecker.cpp">;
+
 def MisusedMovedObjectChecker: Checker<"MisusedMovedObject">,
      HelpText<"Method calls on a moved-from object and copying a moved-from "
               "object will be reported">,
      DescFile<"MisusedMovedObjectChecker.cpp">;
 
-def IteratorPastEndChecker : Checker<"IteratorPastEnd">,
-  HelpText<"Check iterators used past end">,
-  DescFile<"IteratorPastEndChecker.cpp">;
-
 } // end: "alpha.cplusplus"
 
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
index 2910ef4212cc..e3a2164b11ff 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
@@ -43,6 +43,7 @@ typedef std::unique_ptr<ConstraintManager>(*ConstraintManagerCreator)(
     ProgramStateManager &, SubEngine *);
 typedef std::unique_ptr<StoreManager>(*StoreManagerCreator)(
     ProgramStateManager &);
+typedef llvm::ImmutableMap<const SubRegion*, TaintTagType> TaintedSubRegions;
 
 //===----------------------------------------------------------------------===//
 // ProgramStateTrait - Traits used by the Generic Data Map of a ProgramState.
@@ -343,6 +344,9 @@ public:
   ProgramStateRef addTaint(const Stmt *S, const LocationContext *LCtx,
                                TaintTagType Kind = TaintTagGeneric) const;
 
+  /// Create a new state in which the value is marked as tainted.
+  ProgramStateRef addTaint(SVal V, TaintTagType Kind = TaintTagGeneric) const;
+
   /// Create a new state in which the symbol is marked as tainted.
   ProgramStateRef addTaint(SymbolRef S,
                                TaintTagType Kind = TaintTagGeneric) const;
@@ -351,6 +355,14 @@ public:
   ProgramStateRef addTaint(const MemRegion *R,
                                TaintTagType Kind = TaintTagGeneric) const;
 
+  /// Create a new state in a which a sub-region of a given symbol is tainted.
+  /// This might be necessary when referring to regions that can not have an
+  /// individual symbol, e.g. if they are represented by the default binding of
+  /// a LazyCompoundVal.
+  ProgramStateRef addPartialTaint(SymbolRef ParentSym,
+                                  const SubRegion *SubRegion,
+                                  TaintTagType Kind = TaintTagGeneric) const;
+
   /// Check if the statement is tainted in the current state.
   bool isTainted(const Stmt *S, const LocationContext *LCtx,
                  TaintTagType Kind = TaintTagGeneric) const;
@@ -453,6 +465,7 @@ private:
   std::unique_ptr<ConstraintManager>   ConstraintMgr;
 
   ProgramState::GenericDataMap::Factory     GDMFactory;
+  TaintedSubRegions::Factory TSRFactory;
 
   typedef llvm::DenseMap<void*,std::pair<void*,void (*)(void*)> > GDMContextsTy;
   GDMContextsTy GDMContexts;
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h b/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h
index d39b5017d312..7b76263f040c 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h
@@ -35,6 +35,16 @@ template<> struct ProgramStateTrait<TaintMap>
   static void *GDMIndex() { static int index = 0; return &index; }
 };
 
+/// The GDM component mapping derived symbols' parent symbols to their
+/// underlying regions. This is used to efficiently check whether a symbol is
+/// tainted when it represents a sub-region of a tainted symbol.
+struct DerivedSymTaint {};
+typedef llvm::ImmutableMap<SymbolRef, TaintedSubRegions> DerivedSymTaintImpl;
+template<> struct ProgramStateTrait<DerivedSymTaint>
+    :  public ProgramStatePartialTrait<DerivedSymTaintImpl> {
+  static void *GDMIndex() { static int index; return &index; }
+};
+
 class TaintManager {
 
   TaintManager() {}
diff --git a/lib/CodeGen/CGCoroutine.cpp b/lib/CodeGen/CGCoroutine.cpp
index c468c1bb4b5f..f65fb5b9232a 100644
--- a/lib/CodeGen/CGCoroutine.cpp
+++ b/lib/CodeGen/CGCoroutine.cpp
@@ -578,8 +578,7 @@ void CodeGenFunction::EmitCoroutineBody(const CoroutineBodyStmt &S) {
       EmitBlock(FinalBB);
       CurCoro.Data->CurrentAwaitKind = AwaitKind::Final;
       EmitStmt(S.getFinalSuspendStmt());
-    }
-    else {
+    } else {
       // We don't need FinalBB. Emit it to make sure the block is deleted.
       EmitBlock(FinalBB, /*IsFinished=*/true);
     }
diff --git a/lib/Frontend/ASTUnit.cpp b/lib/Frontend/ASTUnit.cpp
index d660638a1e8a..01f7ca8aba9b 100644
--- a/lib/Frontend/ASTUnit.cpp
+++ b/lib/Frontend/ASTUnit.cpp
@@ -1036,8 +1036,6 @@ static void checkAndSanitizeDiags(SmallVectorImpl<StoredDiagnostic> &
 bool ASTUnit::Parse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
                     std::unique_ptr<llvm::MemoryBuffer> OverrideMainBuffer,
                     IntrusiveRefCntPtr<vfs::FileSystem> VFS) {
-  SavedMainFileBuffer.reset();
-
   if (!Invocation)
     return true;
 
@@ -1090,17 +1088,11 @@ bool ASTUnit::Parse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
     Clang->createFileManager();
     FileMgr = &Clang->getFileManager();
   }
-  SourceMgr = new SourceManager(getDiagnostics(), *FileMgr,
-                                UserFilesAreVolatile);
-  TheSema.reset();
-  Ctx = nullptr;
-  PP = nullptr;
-  Reader = nullptr;
 
-  // Clear out old caches and data.
-  TopLevelDecls.clear();
-  clearFileLevelDecls();
+  ResetForParse();
 
+  SourceMgr = new SourceManager(getDiagnostics(), *FileMgr,
+                                UserFilesAreVolatile);
   if (!OverrideMainBuffer) {
     checkAndRemoveNonDriverDiags(StoredDiagnostics);
     TopLevelDeclsInPreamble.clear();
@@ -1999,6 +1991,7 @@ ASTUnit *ASTUnit::LoadFromCommandLine(
   PreprocessorOptions &PPOpts = CI->getPreprocessorOpts();
   PPOpts.RemappedFilesKeepOriginalName = RemappedFilesKeepOriginalName;
   PPOpts.AllowPCHWithCompilerErrors = AllowPCHWithCompilerErrors;
+  PPOpts.GeneratePreamble = PrecompilePreambleAfterNParses != 0;
   
   // Override the resources path.
   CI->getHeaderSearchOpts().ResourceDir = ResourceFilesPath;
@@ -2115,6 +2108,19 @@ bool ASTUnit::Reparse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
   return Result;
 }
 
+void ASTUnit::ResetForParse() {
+  SavedMainFileBuffer.reset();
+
+  SourceMgr.reset();
+  TheSema.reset();
+  Ctx.reset();
+  PP.reset();
+  Reader.reset();
+
+  TopLevelDecls.clear();
+  clearFileLevelDecls();
+}
+
 //----------------------------------------------------------------------------//
 // Code completion
 //----------------------------------------------------------------------------//
diff --git a/lib/Frontend/SerializedDiagnosticPrinter.cpp b/lib/Frontend/SerializedDiagnosticPrinter.cpp
index 7f88c919e24a..b5a5acd8ad9a 100644
--- a/lib/Frontend/SerializedDiagnosticPrinter.cpp
+++ b/lib/Frontend/SerializedDiagnosticPrinter.cpp
@@ -506,7 +506,7 @@ void SDiagsWriter::EmitBlockInfoBlock() {
   Abbrev->Add(BitCodeAbbrevOp(RECORD_FILENAME));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // Mapped file ID.
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Size.
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Modifcation time.  
+  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Modification time.  
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Text size.
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // File name text.
   Abbrevs.set(RECORD_FILENAME, Stream.EmitBlockInfoAbbrev(BLOCK_DIAG,
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index 957fd5f65e26..90fd477d9b98 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -2887,87 +2887,79 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 
 /* vec_ctf */
 
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
-
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
-                                                    int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector unsigned long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
-
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector signed long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)),  \
+             vector unsigned long long                                         \
+           : (__builtin_convertvector((vector unsigned long long)(__a),        \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)),              \
+             vector signed long long                                           \
+           : (__builtin_convertvector((vector signed long long)(__a),          \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)))
+#else
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
 #endif
 
 /* vec_vcfsx */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfsx(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
+#define vec_vcfux __builtin_altivec_vcfux
 
 /* vec_vcfux */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfux(vector unsigned int __a, int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
+#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b))
 
 /* vec_cts */
 
-static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_cts(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector signed long long);
-}
+#define vec_cts(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctsxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector signed long long);          \
+           }))
+#else
+#define vec_cts __builtin_altivec_vctsxs
 #endif
 
 /* vec_vctsxs */
 
-static __inline__ vector int __attribute__((__always_inline__))
-vec_vctsxs(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
+#define vec_vctsxs __builtin_altivec_vctsxs
 
 /* vec_ctu */
 
-static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
-                                                           int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector unsigned long long __ATTRS_o_ai
-vec_ctu(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector unsigned long long);
-}
+#define vec_ctu(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctuxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + __b)   \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector unsigned long long);        \
+           }))
+#else
+#define vec_ctu __builtin_altivec_vctuxs
 #endif
 
 /* vec_vctuxs */
 
-static __inline__ vector unsigned int __attribute__((__always_inline__))
-vec_vctuxs(vector float __a, int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
+#define vec_vctuxs __builtin_altivec_vctuxs
 
 /* vec_signed */
 
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 92942fd09a0c..f5a35e97d6e1 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -550,8 +550,6 @@ namespace {
 
   enum PreambleDirectiveKind {
     PDK_Skipped,
-    PDK_StartIf,
-    PDK_EndIf,
     PDK_Unknown
   };
 
@@ -574,8 +572,6 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
 
   bool InPreprocessorDirective = false;
   Token TheTok;
-  Token IfStartTok;
-  unsigned IfCount = 0;
   SourceLocation ActiveCommentLoc;
 
   unsigned MaxLineOffset = 0;
@@ -658,33 +654,18 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
               .Case("sccs", PDK_Skipped)
               .Case("assert", PDK_Skipped)
               .Case("unassert", PDK_Skipped)
-              .Case("if", PDK_StartIf)
-              .Case("ifdef", PDK_StartIf)
-              .Case("ifndef", PDK_StartIf)
+              .Case("if", PDK_Skipped)
+              .Case("ifdef", PDK_Skipped)
+              .Case("ifndef", PDK_Skipped)
               .Case("elif", PDK_Skipped)
               .Case("else", PDK_Skipped)
-              .Case("endif", PDK_EndIf)
+              .Case("endif", PDK_Skipped)
               .Default(PDK_Unknown);
 
         switch (PDK) {
         case PDK_Skipped:
           continue;
 
-        case PDK_StartIf:
-          if (IfCount == 0)
-            IfStartTok = HashTok;
-            
-          ++IfCount;
-          continue;
-            
-        case PDK_EndIf:
-          // Mismatched #endif. The preamble ends here.
-          if (IfCount == 0)
-            break;
-
-          --IfCount;
-          continue;
-            
         case PDK_Unknown:
           // We don't know what this directive is; stop at the '#'.
           break;
@@ -705,16 +686,13 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
   } while (true);
   
   SourceLocation End;
-  if (IfCount)
-    End = IfStartTok.getLocation();
-  else if (ActiveCommentLoc.isValid())
+  if (ActiveCommentLoc.isValid())
     End = ActiveCommentLoc; // don't truncate a decl comment.
   else
     End = TheTok.getLocation();
 
   return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
-                        IfCount? IfStartTok.isAtStartOfLine()
-                               : TheTok.isAtStartOfLine());
+                        TheTok.isAtStartOfLine());
 }
 
 /// AdvanceToTokenCharacter - Given a location that specifies the start of a
@@ -2570,6 +2548,11 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
     return true;
   }
   
+  if (PP->isRecordingPreamble() && !PP->isInMainFile()) {
+    PP->setRecordedPreambleConditionalStack(ConditionalStack);
+    ConditionalStack.clear();
+  }
+
   // Issue diagnostics for unterminated #if and missing newline.
 
   // If we are in a #if directive, emit an error.
diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index 030717b8bd5c..8b5877934f61 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/lib/Lex/PPDirectives.cpp
@@ -1906,6 +1906,25 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
     }
   }
 
+  // The #included file will be considered to be a system header if either it is
+  // in a system include directory, or if the #includer is a system include
+  // header.
+  SrcMgr::CharacteristicKind FileCharacter =
+      SourceMgr.getFileCharacteristic(FilenameTok.getLocation());
+  if (File)
+    FileCharacter = std::max(HeaderInfo.getFileDirFlavor(File), FileCharacter);
+
+  // Ask HeaderInfo if we should enter this #include file.  If not, #including
+  // this file will have no effect.
+  bool SkipHeader = false;
+  if (ShouldEnter && File &&
+      !HeaderInfo.ShouldEnterIncludeFile(*this, File, isImport,
+                                         getLangOpts().Modules,
+                                         SuggestedModule.getModule())) {
+    ShouldEnter = false;
+    SkipHeader = true;
+  }
+
   if (Callbacks) {
     // Notify the callback object that we've seen an inclusion directive.
     Callbacks->InclusionDirective(
@@ -1913,18 +1932,13 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
         LangOpts.MSVCCompat ? NormalizedPath.c_str() : Filename, isAngled,
         FilenameRange, File, SearchPath, RelativePath,
         ShouldEnter ? nullptr : SuggestedModule.getModule());
+    if (SkipHeader && !SuggestedModule.getModule())
+      Callbacks->FileSkipped(*File, FilenameTok, FileCharacter);
   }
 
   if (!File)
     return;
 
-  // The #included file will be considered to be a system header if either it is
-  // in a system include directory, or if the #includer is a system include
-  // header.
-  SrcMgr::CharacteristicKind FileCharacter =
-    std::max(HeaderInfo.getFileDirFlavor(File),
-             SourceMgr.getFileCharacteristic(FilenameTok.getLocation()));
-
   // FIXME: If we have a suggested module, and we've already visited this file,
   // don't bother entering it again. We know it has no further effect.
 
@@ -1964,19 +1978,6 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
     }
   }
 
-  // Ask HeaderInfo if we should enter this #include file.  If not, #including
-  // this file will have no effect.
-  bool SkipHeader = false;
-  if (ShouldEnter &&
-      !HeaderInfo.ShouldEnterIncludeFile(*this, File, isImport,
-                                         getLangOpts().Modules,
-                                         SuggestedModule.getModule())) {
-    ShouldEnter = false;
-    SkipHeader = true;
-    if (Callbacks)
-      Callbacks->FileSkipped(*File, FilenameTok, FileCharacter);
-  }
-
   // If we don't need to enter the file, stop now.
   if (!ShouldEnter) {
     // If this is a module import, make it visible if needed.
diff --git a/lib/Lex/PPLexerChange.cpp b/lib/Lex/PPLexerChange.cpp
index 5a589d6a17b3..1c0cd5636835 100644
--- a/lib/Lex/PPLexerChange.cpp
+++ b/lib/Lex/PPLexerChange.cpp
@@ -46,6 +46,12 @@ bool Preprocessor::isInPrimaryFile() const {
   });
 }
 
+bool Preprocessor::isInMainFile() const {
+  if (IsFileLexer())
+    return IncludeMacroStack.size() == 0;
+  return true;
+}
+
 /// getCurrentLexer - Return the current file lexer being lexed from.  Note
 /// that this ignores any potentially active macro expansions and _Pragma
 /// expansions going on at the time.
diff --git a/lib/Lex/Pragma.cpp b/lib/Lex/Pragma.cpp
index 2d078a4e7603..e1d981527bec 100644
--- a/lib/Lex/Pragma.cpp
+++ b/lib/Lex/Pragma.cpp
@@ -1407,6 +1407,24 @@ struct PragmaModuleBeginHandler : public PragmaHandler {
       M = NewM;
     }
 
+    // If the module isn't available, it doesn't make sense to enter it.
+    if (!M->isAvailable()) {
+      Module::Requirement Requirement;
+      Module::UnresolvedHeaderDirective MissingHeader;
+      (void)M->isAvailable(PP.getLangOpts(), PP.getTargetInfo(),
+                           Requirement, MissingHeader);
+      if (MissingHeader.FileNameLoc.isValid()) {
+        PP.Diag(MissingHeader.FileNameLoc, diag::err_module_header_missing)
+          << MissingHeader.IsUmbrella << MissingHeader.FileName;
+      } else {
+        PP.Diag(M->DefinitionLoc, diag::err_module_unavailable)
+          << M->getFullModuleName() << Requirement.second << Requirement.first;
+      }
+      PP.Diag(BeginLoc, diag::note_pp_module_begin_here)
+        << M->getTopLevelModuleName();
+      return;
+    }
+
     // Enter the scope of the submodule.
     PP.EnterSubmodule(M, BeginLoc, /*ForPragma*/true);
     PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().second),
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index dce8c1efda23..3596337c245e 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -150,6 +150,9 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
     Ident_GetExceptionInfo = Ident_GetExceptionCode = nullptr;
     Ident_AbnormalTermination = nullptr;
   }
+
+  if (this->PPOpts->GeneratePreamble)
+    PreambleConditionalStack.startRecording();
 }
 
 Preprocessor::~Preprocessor() {
@@ -532,6 +535,12 @@ void Preprocessor::EnterMainSourceFile() {
 
   // Start parsing the predefines.
   EnterSourceFile(FID, nullptr, SourceLocation());
+
+  // Restore the conditional stack from the preamble, if there is one.
+  if (PreambleConditionalStack.isReplaying()) {
+    CurPPLexer->setConditionalLevels(PreambleConditionalStack.getStack());
+    PreambleConditionalStack.doneReplaying();
+  }
 }
 
 void Preprocessor::EndSourceFile() {
diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp
index 96fd952c81c7..a9adbec4f842 100644
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -12509,6 +12509,9 @@ NamedDecl *Sema::ImplicitlyDefineFunction(SourceLocation Loc,
   unsigned diag_id;
   if (II.getName().startswith("__builtin_"))
     diag_id = diag::warn_builtin_unknown;
+  // OpenCL v2.0 s6.9.u - Implicit function declaration is not supported.
+  else if (getLangOpts().OpenCL)
+    diag_id = diag::err_opencl_implicit_function_decl;
   else if (getLangOpts().C99)
     diag_id = diag::ext_implicit_function_decl;
   else
diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index 759c82eeaa07..b1a07ffb7206 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp
@@ -8074,28 +8074,38 @@ QualType Sema::InvalidLogicalVectorOperands(SourceLocation Loc, ExprResult &LHS,
 /// rank; for C, Obj-C, and C++ we allow any real scalar conversion except
 /// for float->int.
 ///
+/// OpenCL V2.0 6.2.6.p2:
+/// An error shall occur if any scalar operand type has greater rank
+/// than the type of the vector element.
+///
 /// \param scalar - if non-null, actually perform the conversions
 /// \return true if the operation fails (but without diagnosing the failure)
 static bool tryVectorConvertAndSplat(Sema &S, ExprResult *scalar,
                                      QualType scalarTy,
                                      QualType vectorEltTy,
-                                     QualType vectorTy) {
+                                     QualType vectorTy,
+                                     unsigned &DiagID) {
   // The conversion to apply to the scalar before splatting it,
   // if necessary.
   CastKind scalarCast = CK_Invalid;
   
   if (vectorEltTy->isIntegralType(S.Context)) {
-    if (!scalarTy->isIntegralType(S.Context))
+    if (S.getLangOpts().OpenCL && (scalarTy->isRealFloatingType() ||
+        (scalarTy->isIntegerType() &&
+         S.Context.getIntegerTypeOrder(vectorEltTy, scalarTy) < 0))) {
+      DiagID = diag::err_opencl_scalar_type_rank_greater_than_vector_type;
       return true;
-    if (S.getLangOpts().OpenCL &&
-        S.Context.getIntegerTypeOrder(vectorEltTy, scalarTy) < 0)
+    }
+    if (!scalarTy->isIntegralType(S.Context))
       return true;
     scalarCast = CK_IntegralCast;
   } else if (vectorEltTy->isRealFloatingType()) {
     if (scalarTy->isRealFloatingType()) {
       if (S.getLangOpts().OpenCL &&
-          S.Context.getFloatingTypeOrder(vectorEltTy, scalarTy) < 0)
+          S.Context.getFloatingTypeOrder(vectorEltTy, scalarTy) < 0) {
+        DiagID = diag::err_opencl_scalar_type_rank_greater_than_vector_type;
         return true;
+      }
       scalarCast = CK_FloatingCast;
     }
     else if (scalarTy->isIntegralType(S.Context))
@@ -8341,10 +8351,12 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
 
   // If there's a vector type and a scalar, try to convert the scalar to
   // the vector element type and splat.
+  unsigned DiagID = diag::err_typecheck_vector_not_convertable;
   if (!RHSVecType) {
     if (isa<ExtVectorType>(LHSVecType)) {
       if (!tryVectorConvertAndSplat(*this, &RHS, RHSType,
-                                    LHSVecType->getElementType(), LHSType))
+                                    LHSVecType->getElementType(), LHSType,
+                                    DiagID))
         return LHSType;
     } else {
       if (!tryGCCVectorConvertAndSplat(*this, &RHS, &LHS))
@@ -8355,7 +8367,7 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
     if (isa<ExtVectorType>(RHSVecType)) {
       if (!tryVectorConvertAndSplat(*this, (IsCompAssign ? nullptr : &LHS),
                                     LHSType, RHSVecType->getElementType(),
-                                    RHSType))
+                                    RHSType, DiagID))
         return RHSType;
     } else {
       if (LHS.get()->getValueKind() == VK_LValue ||
@@ -8431,7 +8443,7 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
   }
 
   // Otherwise, use the generic diagnostic.
-  Diag(Loc, diag::err_typecheck_vector_not_convertable)
+  Diag(Loc, DiagID)
     << LHSType << RHSType
     << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
   return QualType();
diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp
index 43fd055bbc56..2b7733d2adbd 100644
--- a/lib/Sema/SemaOpenMP.cpp
+++ b/lib/Sema/SemaOpenMP.cpp
@@ -5929,16 +5929,17 @@ StmtResult Sema::ActOnOpenMPTargetParallelForDirective(
                                                B, DSAStack->isCancelRegion());
 }
 
-/// \brief Check for existence of a map clause in the list of clauses.
-static bool HasMapClause(ArrayRef<OMPClause *> Clauses) {
-  for (ArrayRef<OMPClause *>::iterator I = Clauses.begin(), E = Clauses.end();
-       I != E; ++I) {
-    if (*I != nullptr && (*I)->getClauseKind() == OMPC_map) {
-      return true;
-    }
-  }
+/// Check for existence of a map clause in the list of clauses.
+static bool hasClauses(ArrayRef<OMPClause *> Clauses,
+                       const OpenMPClauseKind K) {
+  return llvm::any_of(
+      Clauses, [K](const OMPClause *C) { return C->getClauseKind() == K; });
+}
 
-  return false;
+template <typename... Params>
+static bool hasClauses(ArrayRef<OMPClause *> Clauses, const OpenMPClauseKind K,
+                       const Params... ClauseTypes) {
+  return hasClauses(Clauses, K) || hasClauses(Clauses, ClauseTypes...);
 }
 
 StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
@@ -5952,8 +5953,9 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
 
   // OpenMP [2.10.1, Restrictions, p. 97]
   // At least one map clause must appear on the directive.
-  if (!HasMapClause(Clauses)) {
-    Diag(StartLoc, diag::err_omp_no_map_for_directive)
+  if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr)) {
+    Diag(StartLoc, diag::err_omp_no_clause_for_directive)
+        << "'map' or 'use_device_ptr'"
         << getOpenMPDirectiveName(OMPD_target_data);
     return StmtError();
   }
@@ -5970,9 +5972,9 @@ Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
                                           SourceLocation EndLoc) {
   // OpenMP [2.10.2, Restrictions, p. 99]
   // At least one map clause must appear on the directive.
-  if (!HasMapClause(Clauses)) {
-    Diag(StartLoc, diag::err_omp_no_map_for_directive)
-        << getOpenMPDirectiveName(OMPD_target_enter_data);
+  if (!hasClauses(Clauses, OMPC_map)) {
+    Diag(StartLoc, diag::err_omp_no_clause_for_directive)
+        << "'map'" << getOpenMPDirectiveName(OMPD_target_enter_data);
     return StmtError();
   }
 
@@ -5986,9 +5988,9 @@ Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
                                          SourceLocation EndLoc) {
   // OpenMP [2.10.3, Restrictions, p. 102]
   // At least one map clause must appear on the directive.
-  if (!HasMapClause(Clauses)) {
-    Diag(StartLoc, diag::err_omp_no_map_for_directive)
-        << getOpenMPDirectiveName(OMPD_target_exit_data);
+  if (!hasClauses(Clauses, OMPC_map)) {
+    Diag(StartLoc, diag::err_omp_no_clause_for_directive)
+        << "'map'" << getOpenMPDirectiveName(OMPD_target_exit_data);
     return StmtError();
   }
 
@@ -5998,12 +6000,7 @@ Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
 StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
                                                   SourceLocation StartLoc,
                                                   SourceLocation EndLoc) {
-  bool seenMotionClause = false;
-  for (auto *C : Clauses) {
-    if (C->getClauseKind() == OMPC_to || C->getClauseKind() == OMPC_from)
-      seenMotionClause = true;
-  }
-  if (!seenMotionClause) {
+  if (!hasClauses(Clauses, OMPC_to, OMPC_from)) {
     Diag(StartLoc, diag::err_omp_at_least_one_motion_clause_required);
     return StmtError();
   }
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index 55cb670f427e..b7bbb9dc7be1 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -2925,6 +2925,21 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
       }
       break;
 
+    case PP_CONDITIONAL_STACK:
+      if (!Record.empty()) {
+        SmallVector<PPConditionalInfo, 4> ConditionalStack;
+        for (unsigned Idx = 0, N = Record.size() - 1; Idx < N; /* in loop */) {
+          auto Loc = ReadSourceLocation(F, Record, Idx);
+          bool WasSkipping = Record[Idx++];
+          bool FoundNonSkip = Record[Idx++];
+          bool FoundElse = Record[Idx++];
+          ConditionalStack.push_back(
+              {Loc, WasSkipping, FoundNonSkip, FoundElse});
+        }
+        PP.setReplayablePreambleConditionalStack(ConditionalStack);
+      }
+      break;
+
     case PP_COUNTER_VALUE:
       if (!Record.empty() && Listener)
         Listener->ReadCounter(F, Record[0]);
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index b3556371c9b8..c931b13f65f3 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -1093,6 +1093,7 @@ void ASTWriter::WriteBlockInfoBlock() {
   RECORD(UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES);
   RECORD(DELETE_EXPRS_TO_ANALYZE);
   RECORD(CUDA_PRAGMA_FORCE_HOST_DEVICE_DEPTH);
+  RECORD(PP_CONDITIONAL_STACK);
 
   // SourceManager Block.
   BLOCK(SOURCE_MANAGER_BLOCK);
@@ -2302,6 +2303,18 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
     Stream.EmitRecord(PP_COUNTER_VALUE, Record);
   }
 
+  if (PP.isRecordingPreamble() && PP.hasRecordedPreamble()) {
+    assert(!IsModule);
+    for (const auto &Cond : PP.getPreambleConditionalStack()) {
+      AddSourceLocation(Cond.IfLoc, Record);
+      Record.push_back(Cond.WasSkipping);
+      Record.push_back(Cond.FoundNonSkip);
+      Record.push_back(Cond.FoundElse);
+    }
+    Stream.EmitRecord(PP_CONDITIONAL_STACK, Record);
+    Record.clear();
+  }
+
   // Enter the preprocessor block.
   Stream.EnterSubblock(PREPROCESSOR_BLOCK_ID, 3);
 
diff --git a/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/lib/StaticAnalyzer/Checkers/CMakeLists.txt
index 60d60bc074eb..2759240dd276 100644
--- a/lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ b/lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -39,7 +39,7 @@ add_clang_library(clangStaticAnalyzerCheckers
   GenericTaintChecker.cpp
   GTestChecker.cpp
   IdenticalExprChecker.cpp
-  IteratorPastEndChecker.cpp
+  IteratorChecker.cpp
   IvarInvalidationChecker.cpp
   LLVMConventionsChecker.cpp
   LocalizationChecker.cpp
diff --git a/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index b1a54e77951b..883c6a663291 100644
--- a/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,21 +65,8 @@ private:
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext &C);
 
-  /// This is called from getPointedToSymbol() to resolve symbol references for
-  /// the region underlying a LazyCompoundVal. This is the default binding
-  /// for the LCV, which could be a conjured symbol from a function call that
-  /// initialized the region. It only returns the conjured symbol if the LCV
-  /// covers the entire region, e.g. we avoid false positives by not returning
-  /// a default bindingc for an entire struct if the symbol for only a single
-  /// field or element within it is requested.
-  // TODO: Return an appropriate symbol for sub-fields/elements of an LCV so
-  // that they are also appropriately tainted.
-  static SymbolRef getLCVSymbol(CheckerContext &C,
-                                nonloc::LazyCompoundVal &LCV);
-
-  /// \brief Given a pointer argument, get the symbol of the value it contains
-  /// (points to).
-  static SymbolRef getPointedToSymbol(CheckerContext &C, const Expr *Arg);
+  /// \brief Given a pointer argument, return the value it points to.
+  static Optional<SVal> getPointedToSVal(CheckerContext &C, const Expr *Arg);
 
   /// Functions defining the attack surface.
   typedef ProgramStateRef (GenericTaintChecker::*FnCheck)(const CallExpr *,
@@ -186,9 +173,14 @@ private:
     static inline bool isTaintedOrPointsToTainted(const Expr *E,
                                                   ProgramStateRef State,
                                                   CheckerContext &C) {
-      return (State->isTainted(E, C.getLocationContext()) || isStdin(E, C) ||
-              (E->getType().getTypePtr()->isPointerType() &&
-               State->isTainted(getPointedToSymbol(C, E))));
+      if (State->isTainted(E, C.getLocationContext()) || isStdin(E, C))
+        return true;
+
+      if (!E->getType().getTypePtr()->isPointerType())
+        return false;
+
+      Optional<SVal> V = getPointedToSVal(C, E);
+      return (V && State->isTainted(*V));
     }
 
     /// \brief Pre-process a function which propagates taint according to the
@@ -400,9 +392,9 @@ bool GenericTaintChecker::propagateFromPre(const CallExpr *CE,
     if (CE->getNumArgs() < (ArgNum + 1))
       return false;
     const Expr* Arg = CE->getArg(ArgNum);
-    SymbolRef Sym = getPointedToSymbol(C, Arg);
-    if (Sym)
-      State = State->addTaint(Sym);
+    Optional<SVal> V = getPointedToSVal(C, Arg);
+    if (V)
+      State = State->addTaint(*V);
   }
 
   // Clear up the taint info from the state.
@@ -473,47 +465,20 @@ bool GenericTaintChecker::checkPre(const CallExpr *CE, CheckerContext &C) const{
   return false;
 }
 
-SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext &C,
-                                            nonloc::LazyCompoundVal &LCV) {
-  StoreManager &StoreMgr = C.getStoreManager();
-
-  // getLCVSymbol() is reached in a PostStmt so we can always expect a default
-  // binding to exist if one is present.
-  if (Optional<SVal> binding = StoreMgr.getDefaultBinding(LCV)) {
-    SymbolRef Sym = binding->getAsSymbol();
-    if (!Sym)
-      return nullptr;
-
-    // If the LCV covers an entire base region return the default conjured symbol.
-    if (LCV.getRegion() == LCV.getRegion()->getBaseRegion())
-      return Sym;
-  }
-
-  // Otherwise, return a nullptr as there's not yet a functional way to taint
-  // sub-regions of LCVs.
-  return nullptr;
-}
-
-SymbolRef GenericTaintChecker::getPointedToSymbol(CheckerContext &C,
-                                                  const Expr* Arg) {
+Optional<SVal> GenericTaintChecker::getPointedToSVal(CheckerContext &C,
+                                            const Expr* Arg) {
   ProgramStateRef State = C.getState();
   SVal AddrVal = State->getSVal(Arg->IgnoreParens(), C.getLocationContext());
   if (AddrVal.isUnknownOrUndef())
-    return nullptr;
+    return None;
 
   Optional<Loc> AddrLoc = AddrVal.getAs<Loc>();
   if (!AddrLoc)
-    return nullptr;
+    return None;
 
   const PointerType *ArgTy =
     dyn_cast<PointerType>(Arg->getType().getCanonicalType().getTypePtr());
-  SVal Val = State->getSVal(*AddrLoc,
-                            ArgTy ? ArgTy->getPointeeType(): QualType());
-
-  if (auto LCV = Val.getAs<nonloc::LazyCompoundVal>())
-    return getLCVSymbol(C, *LCV);
-
-  return Val.getAsSymbol();
+  return State->getSVal(*AddrLoc, ArgTy ? ArgTy->getPointeeType(): QualType());
 }
 
 ProgramStateRef
@@ -633,9 +598,9 @@ ProgramStateRef GenericTaintChecker::postScanf(const CallExpr *CE,
     // The arguments are pointer arguments. The data they are pointing at is
     // tainted after the call.
     const Expr* Arg = CE->getArg(i);
-        SymbolRef Sym = getPointedToSymbol(C, Arg);
-    if (Sym)
-      State = State->addTaint(Sym);
+    Optional<SVal> V = getPointedToSVal(C, Arg);
+    if (V)
+      State = State->addTaint(*V);
   }
   return State;
 }
@@ -710,10 +675,10 @@ bool GenericTaintChecker::generateReportIfTainted(const Expr *E,
 
   // Check for taint.
   ProgramStateRef State = C.getState();
-  const SymbolRef PointedToSym = getPointedToSymbol(C, E);
+  Optional<SVal> PointedToSVal = getPointedToSVal(C, E);
   SVal TaintedSVal;
-  if (State->isTainted(PointedToSym))
-    TaintedSVal = nonloc::SymbolVal(PointedToSym);
+  if (PointedToSVal && State->isTainted(*PointedToSVal))
+    TaintedSVal = *PointedToSVal;
   else if (State->isTainted(E, C.getLocationContext()))
     TaintedSVal = C.getSVal(E);
   else
diff --git a/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp b/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
new file mode 100644
index 000000000000..0f9b749506fa
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
@@ -0,0 +1,833 @@
+//===-- IteratorChecker.cpp ---------------------------------------*- C++ -*--//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a checker for using iterators outside their range (past end). Usage
+// means here dereferencing, incrementing etc.
+//
+//===----------------------------------------------------------------------===//
+//
+// In the code, iterator can be represented as a:
+// * type-I: typedef-ed pointer. Operations over such iterator, such as
+//           comparisons or increments, are modeled straightforwardly by the
+//           analyzer.
+// * type-II: structure with its method bodies available.  Operations over such
+//            iterator are inlined by the analyzer, and results of modeling
+//            these operations are exposing implementation details of the
+//            iterators, which is not necessarily helping.
+// * type-III: completely opaque structure. Operations over such iterator are
+//             modeled conservatively, producing conjured symbols everywhere.
+//
+// To handle all these types in a common way we introduce a structure called
+// IteratorPosition which is an abstraction of the position the iterator
+// represents using symbolic expressions. The checker handles all the
+// operations on this structure.
+//
+// Additionally, depending on the circumstances, operators of types II and III
+// can be represented as:
+// * type-IIa, type-IIIa: conjured structure symbols - when returned by value
+//                        from conservatively evaluated methods such as
+//                        `.begin()`.
+// * type-IIb, type-IIIb: memory regions of iterator-typed objects, such as
+//                        variables or temporaries, when the iterator object is
+//                        currently treated as an lvalue.
+// * type-IIc, type-IIIc: compound values of iterator-typed objects, when the
+//                        iterator object is treated as an rvalue taken of a
+//                        particular lvalue, eg. a copy of "type-a" iterator
+//                        object, or an iterator that existed before the
+//                        analysis has started.
+//
+// To handle any of these three different representations stored in an SVal we
+// use setter and getters functions which separate the three cases. To store
+// them we use a pointer union of symbol and memory region.
+//
+// The checker works the following way: We record the past-end iterator for
+// all containers whenever their `.end()` is called. Since the Constraint
+// Manager cannot handle SVals we need to take over its role. We post-check
+// equality and non-equality comparisons and propagate the position of the
+// iterator to the other side of the comparison if it is past-end and we are in
+// the 'equal' branch (true-branch for `==` and false-branch for `!=`).
+//
+// In case of type-I or type-II iterators we get a concrete integer as a result
+// of the comparison (1 or 0) but in case of type-III we only get a Symbol. In
+// this latter case we record the symbol and reload it in evalAssume() and do
+// the propagation there. We also handle (maybe double) negated comparisons
+// which are represented in the form of (x == 0 or x !=0 ) where x is the
+// comparison itself.
+
+#include "ClangSACheckers.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+// Abstract position of an iterator. This helps to handle all three kinds
+// of operators in a common way by using a symbolic position.
+struct IteratorPosition {
+private:
+
+  // Container the iterator belongs to
+  const MemRegion *Cont;
+
+  // Abstract offset
+  SymbolRef Offset;
+
+  IteratorPosition(const MemRegion *C, SymbolRef Of)
+      : Cont(C), Offset(Of) {}
+
+public:
+  const MemRegion *getContainer() const { return Cont; }
+  SymbolRef getOffset() const { return Offset; }
+
+  static IteratorPosition getPosition(const MemRegion *C, SymbolRef Of) {
+    return IteratorPosition(C, Of);
+  }
+
+  IteratorPosition setTo(SymbolRef NewOf) const {
+    return IteratorPosition(Cont, NewOf);
+  }
+
+  bool operator==(const IteratorPosition &X) const {
+    return Cont == X.Cont && Offset == X.Offset;
+  }
+
+  bool operator!=(const IteratorPosition &X) const {
+    return Cont != X.Cont || Offset != X.Offset;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.AddPointer(Cont);
+    ID.Add(Offset);
+  }
+};
+
+typedef llvm::PointerUnion<const MemRegion *, SymbolRef> RegionOrSymbol;
+
+// Structure to record the symbolic end position of a container
+struct ContainerData {
+private:
+  SymbolRef End;
+
+  ContainerData(SymbolRef E) : End(E) {}
+
+public:
+  static ContainerData fromEnd(SymbolRef E) {
+    return ContainerData(E);
+  }
+
+  SymbolRef getEnd() const { return End; }
+
+  ContainerData newEnd(SymbolRef E) const { return ContainerData(E); }
+
+  bool operator==(const ContainerData &X) const {
+    return End == X.End;
+  }
+
+  bool operator!=(const ContainerData &X) const {
+    return End != X.End;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.Add(End);
+  }
+};
+
+// Structure fo recording iterator comparisons. We needed to retrieve the
+// original comparison expression in assumptions.
+struct IteratorComparison {
+private:
+  RegionOrSymbol Left, Right;
+  bool Equality;
+
+public:
+  IteratorComparison(RegionOrSymbol L, RegionOrSymbol R, bool Eq)
+      : Left(L), Right(R), Equality(Eq) {}
+
+  RegionOrSymbol getLeft() const { return Left; }
+  RegionOrSymbol getRight() const { return Right; }
+  bool isEquality() const { return Equality; }
+  bool operator==(const IteratorComparison &X) const {
+    return Left == X.Left && Right == X.Right && Equality == X.Equality;
+  }
+  bool operator!=(const IteratorComparison &X) const {
+    return Left != X.Left || Right != X.Right || Equality != X.Equality;
+  }
+  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Equality); }
+};
+
+class IteratorChecker
+    : public Checker<check::PreCall, check::PostCall,
+                     check::PostStmt<MaterializeTemporaryExpr>,
+                     check::DeadSymbols,
+                     eval::Assume> {
+
+  std::unique_ptr<BugType> OutOfRangeBugType;
+
+  void handleComparison(CheckerContext &C, const SVal &RetVal, const SVal &LVal,
+                        const SVal &RVal, OverloadedOperatorKind Op) const;
+  void verifyDereference(CheckerContext &C, const SVal &Val) const;
+  void handleEnd(CheckerContext &C, const Expr *CE, const SVal &RetVal,
+                 const SVal &Cont) const;
+  void assignToContainer(CheckerContext &C, const Expr *CE, const SVal &RetVal,
+                         const MemRegion *Cont) const;
+  void reportOutOfRangeBug(const StringRef &Message, const SVal &Val,
+                           CheckerContext &C, ExplodedNode *ErrNode) const;
+
+public:
+  IteratorChecker();
+
+  enum CheckKind {
+    CK_IteratorRangeChecker,
+    CK_NumCheckKinds
+  };
+
+  DefaultBool ChecksEnabled[CK_NumCheckKinds];
+  CheckName CheckNames[CK_NumCheckKinds];
+
+  void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
+  void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
+  void checkPostStmt(const MaterializeTemporaryExpr *MTE,
+                     CheckerContext &C) const;
+  void checkDeadSymbols(SymbolReaper &SR, CheckerContext &C) const;
+  ProgramStateRef evalAssume(ProgramStateRef State, SVal Cond,
+                             bool Assumption) const;
+};
+} // namespace
+
+REGISTER_MAP_WITH_PROGRAMSTATE(IteratorSymbolMap, SymbolRef, IteratorPosition)
+REGISTER_MAP_WITH_PROGRAMSTATE(IteratorRegionMap, const MemRegion *,
+                               IteratorPosition)
+
+REGISTER_MAP_WITH_PROGRAMSTATE(ContainerMap, const MemRegion *, ContainerData)
+
+REGISTER_MAP_WITH_PROGRAMSTATE(IteratorComparisonMap, const SymExpr *,
+                               IteratorComparison)
+
+namespace {
+
+bool isIteratorType(const QualType &Type);
+bool isIterator(const CXXRecordDecl *CRD);
+bool isEndCall(const FunctionDecl *Func);
+bool isSimpleComparisonOperator(OverloadedOperatorKind OK);
+bool isDereferenceOperator(OverloadedOperatorKind OK);
+BinaryOperator::Opcode getOpcode(const SymExpr *SE);
+const RegionOrSymbol getRegionOrSymbol(const SVal &Val);
+const ProgramStateRef processComparison(ProgramStateRef State,
+                                        RegionOrSymbol LVal,
+                                        RegionOrSymbol RVal, bool Equal);
+const ProgramStateRef saveComparison(ProgramStateRef State,
+                                     const SymExpr *Condition, const SVal &LVal,
+                                     const SVal &RVal, bool Eq);
+const IteratorComparison *loadComparison(ProgramStateRef State,
+                                         const SymExpr *Condition);
+SymbolRef getContainerEnd(ProgramStateRef State, const MemRegion *Cont);
+ProgramStateRef createContainerEnd(ProgramStateRef State, const MemRegion *Cont,
+                                   const SymbolRef Sym);
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            const SVal &Val);
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            RegionOrSymbol RegOrSym);
+ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
+                                    const IteratorPosition &Pos);
+ProgramStateRef setIteratorPosition(ProgramStateRef State,
+                                    RegionOrSymbol RegOrSym,
+                                    const IteratorPosition &Pos);
+ProgramStateRef removeIteratorPosition(ProgramStateRef State, const SVal &Val);
+ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
+                                       RegionOrSymbol RegOrSym,
+                                       const IteratorPosition &Pos, bool Equal);
+ProgramStateRef relateIteratorPositions(ProgramStateRef State,
+                                        const IteratorPosition &Pos1,
+                                        const IteratorPosition &Pos2,
+                                        bool Equal);
+const ContainerData *getContainerData(ProgramStateRef State,
+                                      const MemRegion *Cont);
+ProgramStateRef setContainerData(ProgramStateRef State, const MemRegion *Cont,
+                                 const ContainerData &CData);
+bool isOutOfRange(ProgramStateRef State, const IteratorPosition &Pos);
+} // namespace
+
+IteratorChecker::IteratorChecker() {
+  OutOfRangeBugType.reset(
+      new BugType(this, "Iterator out of range", "Misuse of STL APIs"));
+  OutOfRangeBugType->setSuppressOnSink(true);
+}
+
+void IteratorChecker::checkPreCall(const CallEvent &Call,
+                                   CheckerContext &C) const {
+  // Check for out of range access
+  const auto *Func = dyn_cast_or_null<FunctionDecl>(Call.getDecl());
+  if (!Func)
+    return;
+
+  if (Func->isOverloadedOperator()) {
+    if (ChecksEnabled[CK_IteratorRangeChecker] &&
+               isDereferenceOperator(Func->getOverloadedOperator())) {
+      // Check for dereference of out-of-range iterators
+      if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
+        verifyDereference(C, InstCall->getCXXThisVal());
+      } else {
+        verifyDereference(C, Call.getArgSVal(0));
+      }
+    }
+  }
+}
+
+void IteratorChecker::checkPostCall(const CallEvent &Call,
+                                    CheckerContext &C) const {
+  // Record new iterator positions and iterator position changes
+  const auto *Func = dyn_cast_or_null<FunctionDecl>(Call.getDecl());
+  if (!Func)
+    return;
+
+  if (Func->isOverloadedOperator()) {
+    const auto Op = Func->getOverloadedOperator();
+    if (isSimpleComparisonOperator(Op)) {
+      if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
+        handleComparison(C, Call.getReturnValue(), InstCall->getCXXThisVal(),
+                         Call.getArgSVal(0), Op);
+      } else {
+        handleComparison(C, Call.getReturnValue(), Call.getArgSVal(0),
+                         Call.getArgSVal(1), Op);
+      }
+    }
+  } else {
+    const auto *OrigExpr = Call.getOriginExpr();
+    if (!OrigExpr)
+      return;
+
+    if (!isIteratorType(Call.getResultType()))
+      return;
+
+    auto State = C.getState();
+    // Already bound to container?
+    if (getIteratorPosition(State, Call.getReturnValue()))
+      return;
+
+    if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
+      if (isEndCall(Func)) {
+        handleEnd(C, OrigExpr, Call.getReturnValue(),
+                  InstCall->getCXXThisVal());
+        return;
+      }
+    }
+
+    // Copy-like and move constructors
+    if (isa<CXXConstructorCall>(&Call) && Call.getNumArgs() == 1) {
+      if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(0))) {
+        State = setIteratorPosition(State, Call.getReturnValue(), *Pos);
+        if (cast<CXXConstructorDecl>(Func)->isMoveConstructor()) {
+          State = removeIteratorPosition(State, Call.getArgSVal(0));
+        }
+        C.addTransition(State);
+        return;
+      }
+    }
+
+    // Assumption: if return value is an iterator which is not yet bound to a
+    //             container, then look for the first iterator argument, and
+    //             bind the return value to the same container. This approach
+    //             works for STL algorithms.
+    // FIXME: Add a more conservative mode
+    for (unsigned i = 0; i < Call.getNumArgs(); ++i) {
+      if (isIteratorType(Call.getArgExpr(i)->getType())) {
+        if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(i))) {
+          assignToContainer(C, OrigExpr, Call.getReturnValue(),
+                            Pos->getContainer());
+          return;
+        }
+      }
+    }
+  }
+}
+
+void IteratorChecker::checkPostStmt(const MaterializeTemporaryExpr *MTE,
+                                    CheckerContext &C) const {
+  /* Transfer iterator state to temporary objects */
+  auto State = C.getState();
+  const auto *LCtx = C.getLocationContext();
+  const auto *Pos =
+      getIteratorPosition(State, State->getSVal(MTE->GetTemporaryExpr(), LCtx));
+  if (!Pos)
+    return;
+  State = setIteratorPosition(State, State->getSVal(MTE, LCtx), *Pos);
+  C.addTransition(State);
+}
+
+void IteratorChecker::checkDeadSymbols(SymbolReaper &SR,
+                                       CheckerContext &C) const {
+  // Cleanup
+  auto State = C.getState();
+
+  auto RegionMap = State->get<IteratorRegionMap>();
+  for (const auto Reg : RegionMap) {
+    if (!SR.isLiveRegion(Reg.first)) {
+      State = State->remove<IteratorRegionMap>(Reg.first);
+    }
+  }
+
+  auto SymbolMap = State->get<IteratorSymbolMap>();
+  for (const auto Sym : SymbolMap) {
+    if (!SR.isLive(Sym.first)) {
+      State = State->remove<IteratorSymbolMap>(Sym.first);
+    }
+  }
+
+  auto ContMap = State->get<ContainerMap>();
+  for (const auto Cont : ContMap) {
+    if (!SR.isLiveRegion(Cont.first)) {
+      State = State->remove<ContainerMap>(Cont.first);
+    }
+  }
+
+  auto ComparisonMap = State->get<IteratorComparisonMap>();
+  for (const auto Comp : ComparisonMap) {
+    if (!SR.isLive(Comp.first)) {
+      State = State->remove<IteratorComparisonMap>(Comp.first);
+    }
+  }
+}
+
+ProgramStateRef IteratorChecker::evalAssume(ProgramStateRef State, SVal Cond,
+                                            bool Assumption) const {
+  // Load recorded comparison and transfer iterator state between sides
+  // according to comparison operator and assumption
+  const auto *SE = Cond.getAsSymExpr();
+  if (!SE)
+    return State;
+
+  auto Opc = getOpcode(SE);
+  if (Opc != BO_EQ && Opc != BO_NE)
+    return State;
+
+  bool Negated = false;
+  const auto *Comp = loadComparison(State, SE);
+  if (!Comp) {
+    // Try negated comparison, which is a SymExpr to 0 integer comparison
+    const auto *SIE = dyn_cast<SymIntExpr>(SE);
+    if (!SIE)
+      return State;
+
+    if (SIE->getRHS() != 0)
+      return State;
+
+    SE = SIE->getLHS();
+    Negated = SIE->getOpcode() == BO_EQ; // Equal to zero means negation
+    Opc = getOpcode(SE);
+    if (Opc != BO_EQ && Opc != BO_NE)
+      return State;
+
+    Comp = loadComparison(State, SE);
+    if (!Comp)
+      return State;
+  }
+
+  return processComparison(State, Comp->getLeft(), Comp->getRight(),
+                           (Comp->isEquality() == Assumption) != Negated);
+}
+
+void IteratorChecker::handleComparison(CheckerContext &C, const SVal &RetVal,
+                                       const SVal &LVal, const SVal &RVal,
+                                       OverloadedOperatorKind Op) const {
+  // Record the operands and the operator of the comparison for the next
+  // evalAssume, if the result is a symbolic expression. If it is a concrete
+  // value (only one branch is possible), then transfer the state between
+  // the operands according to the operator and the result
+  auto State = C.getState();
+  if (const auto *Condition = RetVal.getAsSymbolicExpression()) {
+    const auto *LPos = getIteratorPosition(State, LVal);
+    const auto *RPos = getIteratorPosition(State, RVal);
+    if (!LPos && !RPos)
+      return;
+    State = saveComparison(State, Condition, LVal, RVal, Op == OO_EqualEqual);
+    C.addTransition(State);
+  } else if (const auto TruthVal = RetVal.getAs<nonloc::ConcreteInt>()) {
+    if ((State = processComparison(
+             State, getRegionOrSymbol(LVal), getRegionOrSymbol(RVal),
+             (Op == OO_EqualEqual) == (TruthVal->getValue() != 0)))) {
+      C.addTransition(State);
+    } else {
+      C.generateSink(State, C.getPredecessor());
+    }
+  }
+}
+
+void IteratorChecker::verifyDereference(CheckerContext &C,
+                                        const SVal &Val) const {
+  auto State = C.getState();
+  const auto *Pos = getIteratorPosition(State, Val);
+  if (Pos && isOutOfRange(State, *Pos)) {
+    // If I do not put a tag here, some range tests will fail
+    static CheckerProgramPointTag Tag("IteratorRangeChecker",
+                                      "IteratorOutOfRange");
+    auto *N = C.generateNonFatalErrorNode(State, &Tag);
+    if (!N) {
+      return;
+    }
+    reportOutOfRangeBug("Iterator accessed outside of its range.", Val, C, N);
+  }
+}
+
+void IteratorChecker::handleEnd(CheckerContext &C, const Expr *CE,
+                                const SVal &RetVal, const SVal &Cont) const {
+  const auto *ContReg = Cont.getAsRegion();
+  if (!ContReg)
+    return;
+
+  while (const auto *CBOR = ContReg->getAs<CXXBaseObjectRegion>()) {
+    ContReg = CBOR->getSuperRegion();
+  }
+
+  // If the container already has an end symbol then use it. Otherwise first
+  // create a new one.
+  auto State = C.getState();
+  auto EndSym = getContainerEnd(State, ContReg);
+  if (!EndSym) {
+    auto &SymMgr = C.getSymbolManager();
+    EndSym = SymMgr.conjureSymbol(CE, C.getLocationContext(),
+                                  C.getASTContext().LongTy, C.blockCount());
+    State = createContainerEnd(State, ContReg, EndSym);
+  }
+  State = setIteratorPosition(State, RetVal,
+                              IteratorPosition::getPosition(ContReg, EndSym));
+  C.addTransition(State);
+}
+
+void IteratorChecker::assignToContainer(CheckerContext &C, const Expr *CE,
+                                        const SVal &RetVal,
+                                        const MemRegion *Cont) const {
+  while (const auto *CBOR = Cont->getAs<CXXBaseObjectRegion>()) {
+    Cont = CBOR->getSuperRegion();
+  }
+
+  auto State = C.getState();
+  auto &SymMgr = C.getSymbolManager();
+  auto Sym = SymMgr.conjureSymbol(CE, C.getLocationContext(),
+                                  C.getASTContext().LongTy, C.blockCount());
+  State = setIteratorPosition(State, RetVal,
+                              IteratorPosition::getPosition(Cont, Sym));
+  C.addTransition(State);
+}
+
+void IteratorChecker::reportOutOfRangeBug(const StringRef &Message,
+                                          const SVal &Val, CheckerContext &C,
+                                          ExplodedNode *ErrNode) const {
+  auto R = llvm::make_unique<BugReport>(*OutOfRangeBugType, Message, ErrNode);
+  R->markInteresting(Val);
+  C.emitReport(std::move(R));
+}
+
+namespace {
+
+bool isGreaterOrEqual(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2);
+bool compare(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2,
+             BinaryOperator::Opcode Opc);
+
+bool isIteratorType(const QualType &Type) {
+  if (Type->isPointerType())
+    return true;
+
+  const auto *CRD = Type->getUnqualifiedDesugaredType()->getAsCXXRecordDecl();
+  return isIterator(CRD);
+}
+
+bool isIterator(const CXXRecordDecl *CRD) {
+  if (!CRD)
+    return false;
+
+  const auto Name = CRD->getName();
+  if (!(Name.endswith_lower("iterator") || Name.endswith_lower("iter") ||
+        Name.endswith_lower("it")))
+    return false;
+
+  bool HasCopyCtor = false, HasCopyAssign = true, HasDtor = false,
+       HasPreIncrOp = false, HasPostIncrOp = false, HasDerefOp = false;
+  for (const auto *Method : CRD->methods()) {
+    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Method)) {
+      if (Ctor->isCopyConstructor()) {
+        HasCopyCtor = !Ctor->isDeleted() && Ctor->getAccess() == AS_public;
+      }
+      continue;
+    }
+    if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
+      HasDtor = !Dtor->isDeleted() && Dtor->getAccess() == AS_public;
+      continue;
+    }
+    if (Method->isCopyAssignmentOperator()) {
+      HasCopyAssign = !Method->isDeleted() && Method->getAccess() == AS_public;
+      continue;
+    }
+    if (!Method->isOverloadedOperator())
+      continue;
+    const auto OPK = Method->getOverloadedOperator();
+    if (OPK == OO_PlusPlus) {
+      HasPreIncrOp = HasPreIncrOp || (Method->getNumParams() == 0);
+      HasPostIncrOp = HasPostIncrOp || (Method->getNumParams() == 1);
+      continue;
+    }
+    if (OPK == OO_Star) {
+      HasDerefOp = (Method->getNumParams() == 0);
+      continue;
+    }
+  }
+
+  return HasCopyCtor && HasCopyAssign && HasDtor && HasPreIncrOp &&
+         HasPostIncrOp && HasDerefOp;
+}
+
+bool isEndCall(const FunctionDecl *Func) {
+  const auto *IdInfo = Func->getIdentifier();
+  if (!IdInfo)
+    return false;
+  return IdInfo->getName().endswith_lower("end");
+}
+
+bool isSimpleComparisonOperator(OverloadedOperatorKind OK) {
+  return OK == OO_EqualEqual || OK == OO_ExclaimEqual;
+}
+
+bool isDereferenceOperator(OverloadedOperatorKind OK) {
+  return OK == OO_Star || OK == OO_Arrow || OK == OO_ArrowStar ||
+         OK == OO_Subscript;
+}
+
+BinaryOperator::Opcode getOpcode(const SymExpr *SE) {
+  if (const auto *BSE = dyn_cast<BinarySymExpr>(SE)) {
+    return BSE->getOpcode();
+  } else if (const auto *SC = dyn_cast<SymbolConjured>(SE)) {
+    const auto *COE = dyn_cast<CXXOperatorCallExpr>(SC->getStmt());
+    if (!COE)
+      return BO_Comma; // Extremal value, neither EQ nor NE
+    if (COE->getOperator() == OO_EqualEqual) {
+      return BO_EQ;
+    } else if (COE->getOperator() == OO_ExclaimEqual) {
+      return BO_NE;
+    }
+    return BO_Comma; // Extremal value, neither EQ nor NE
+  }
+  return BO_Comma; // Extremal value, neither EQ nor NE
+}
+
+const RegionOrSymbol getRegionOrSymbol(const SVal &Val) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return Reg;
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return Sym;
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return LCVal->getRegion();
+  }
+  return RegionOrSymbol();
+}
+
+const ProgramStateRef processComparison(ProgramStateRef State,
+                                        RegionOrSymbol LVal,
+                                        RegionOrSymbol RVal, bool Equal) {
+  const auto *LPos = getIteratorPosition(State, LVal);
+  const auto *RPos = getIteratorPosition(State, RVal);
+  if (LPos && !RPos) {
+    State = adjustIteratorPosition(State, RVal, *LPos, Equal);
+  } else if (!LPos && RPos) {
+    State = adjustIteratorPosition(State, LVal, *RPos, Equal);
+  } else if (LPos && RPos) {
+    State = relateIteratorPositions(State, *LPos, *RPos, Equal);
+  }
+  return State;
+}
+
+const ProgramStateRef saveComparison(ProgramStateRef State,
+                                     const SymExpr *Condition, const SVal &LVal,
+                                     const SVal &RVal, bool Eq) {
+  const auto Left = getRegionOrSymbol(LVal);
+  const auto Right = getRegionOrSymbol(RVal);
+  if (!Left || !Right)
+    return State;
+  return State->set<IteratorComparisonMap>(Condition,
+                                           IteratorComparison(Left, Right, Eq));
+}
+
+const IteratorComparison *loadComparison(ProgramStateRef State,
+                                         const SymExpr *Condition) {
+  return State->get<IteratorComparisonMap>(Condition);
+}
+
+SymbolRef getContainerEnd(ProgramStateRef State, const MemRegion *Cont) {
+  const auto *CDataPtr = getContainerData(State, Cont);
+  if (!CDataPtr)
+    return nullptr;
+
+  return CDataPtr->getEnd();
+}
+
+ProgramStateRef createContainerEnd(ProgramStateRef State, const MemRegion *Cont,
+                                   const SymbolRef Sym) {
+  // Only create if it does not exist
+  const auto *CDataPtr = getContainerData(State, Cont);
+  if (CDataPtr) {
+    if (CDataPtr->getEnd()) {
+      return State;
+    } else {
+      const auto CData = CDataPtr->newEnd(Sym);
+      return setContainerData(State, Cont, CData);
+    }
+  } else {
+    const auto CData = ContainerData::fromEnd(Sym);
+    return setContainerData(State, Cont, CData);
+  }
+}
+
+const ContainerData *getContainerData(ProgramStateRef State,
+                                      const MemRegion *Cont) {
+  return State->get<ContainerMap>(Cont);
+}
+
+ProgramStateRef setContainerData(ProgramStateRef State, const MemRegion *Cont,
+                                 const ContainerData &CData) {
+  return State->set<ContainerMap>(Cont, CData);
+}
+
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            const SVal &Val) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return State->get<IteratorRegionMap>(Reg);
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return State->get<IteratorSymbolMap>(Sym);
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return State->get<IteratorRegionMap>(LCVal->getRegion());
+  }
+  return nullptr;
+}
+
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            RegionOrSymbol RegOrSym) {
+  if (RegOrSym.is<const MemRegion *>()) {
+    return State->get<IteratorRegionMap>(RegOrSym.get<const MemRegion *>());
+  } else if (RegOrSym.is<SymbolRef>()) {
+    return State->get<IteratorSymbolMap>(RegOrSym.get<SymbolRef>());
+  }
+  return nullptr;
+}
+
+ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
+                                    const IteratorPosition &Pos) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return State->set<IteratorRegionMap>(Reg, Pos);
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return State->set<IteratorSymbolMap>(Sym, Pos);
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return State->set<IteratorRegionMap>(LCVal->getRegion(), Pos);
+  }
+  return nullptr;
+}
+
+ProgramStateRef setIteratorPosition(ProgramStateRef State,
+                                    RegionOrSymbol RegOrSym,
+                                    const IteratorPosition &Pos) {
+  if (RegOrSym.is<const MemRegion *>()) {
+    return State->set<IteratorRegionMap>(RegOrSym.get<const MemRegion *>(),
+                                         Pos);
+  } else if (RegOrSym.is<SymbolRef>()) {
+    return State->set<IteratorSymbolMap>(RegOrSym.get<SymbolRef>(), Pos);
+  }
+  return nullptr;
+}
+
+ProgramStateRef removeIteratorPosition(ProgramStateRef State, const SVal &Val) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return State->remove<IteratorRegionMap>(Reg);
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return State->remove<IteratorSymbolMap>(Sym);
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return State->remove<IteratorRegionMap>(LCVal->getRegion());
+  }
+  return nullptr;
+}
+
+ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
+                                       RegionOrSymbol RegOrSym,
+                                       const IteratorPosition &Pos,
+                                       bool Equal) {
+  if (Equal) {
+    return setIteratorPosition(State, RegOrSym, Pos);
+  } else {
+    return State;
+  }
+}
+
+ProgramStateRef relateIteratorPositions(ProgramStateRef State,
+                                        const IteratorPosition &Pos1,
+                                        const IteratorPosition &Pos2,
+                                        bool Equal) {
+  // Try to compare them and get a defined value
+  auto &SVB = State->getStateManager().getSValBuilder();
+  const auto comparison =
+      SVB.evalBinOp(State, BO_EQ, nonloc::SymbolVal(Pos1.getOffset()),
+                    nonloc::SymbolVal(Pos2.getOffset()), SVB.getConditionType())
+          .getAs<DefinedSVal>();
+  if (comparison) {
+    return State->assume(*comparison, Equal);
+  }
+
+  return State;
+}
+
+bool isOutOfRange(ProgramStateRef State, const IteratorPosition &Pos) {
+  const auto *Cont = Pos.getContainer();
+  const auto *CData = getContainerData(State, Cont);
+  if (!CData)
+    return false;
+
+  // Out of range means less than the begin symbol or greater or equal to the
+  // end symbol.
+
+  const auto End = CData->getEnd();
+  if (End) {
+    if (isGreaterOrEqual(State, Pos.getOffset(), End)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool isGreaterOrEqual(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2) {
+  return compare(State, Sym1, Sym2, BO_GE);
+}
+
+bool compare(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2,
+             BinaryOperator::Opcode Opc) {
+  auto &SMgr = State->getStateManager();
+  auto &SVB = SMgr.getSValBuilder();
+
+  const auto comparison =
+      SVB.evalBinOp(State, Opc, nonloc::SymbolVal(Sym1),
+                    nonloc::SymbolVal(Sym2), SVB.getConditionType())
+          .getAs<DefinedSVal>();
+
+  if(comparison) {
+    return !!State->assume(*comparison, true);
+  }
+
+  return false;
+}
+
+} // namespace
+
+#define REGISTER_CHECKER(name)                                                 \
+  void ento::register##name(CheckerManager &Mgr) {                             \
+    auto *checker = Mgr.registerChecker<IteratorChecker>();                    \
+    checker->ChecksEnabled[IteratorChecker::CK_##name] = true;                 \
+    checker->CheckNames[IteratorChecker::CK_##name] =                          \
+        Mgr.getCurrentCheckName();                                             \
+  }
+
+REGISTER_CHECKER(IteratorRangeChecker)
diff --git a/lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp b/lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp
deleted file mode 100644
index f0f7c98c9640..000000000000
--- a/lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp
+++ /dev/null
@@ -1,840 +0,0 @@
-//===-- IteratorPastEndChecker.cpp --------------------------------*- C++ -*--//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines a checker for using iterators outside their range (past end). Usage
-// means here dereferencing, incrementing etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ClangSACheckers.h"
-#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
-#include "clang/StaticAnalyzer/Core/Checker.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
-
-#include <utility>
-
-using namespace clang;
-using namespace ento;
-
-namespace {
-struct IteratorPosition {
-private:
-  enum Kind { InRange, OutofRange } K;
-  IteratorPosition(Kind InK) : K(InK) {}
-
-public:
-  bool isInRange() const { return K == InRange; }
-  bool isOutofRange() const { return K == OutofRange; }
-
-  static IteratorPosition getInRange() { return IteratorPosition(InRange); }
-  static IteratorPosition getOutofRange() {
-    return IteratorPosition(OutofRange);
-  }
-
-  bool operator==(const IteratorPosition &X) const { return K == X.K; }
-  bool operator!=(const IteratorPosition &X) const { return K != X.K; }
-  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(K); }
-};
-
-typedef llvm::PointerUnion<const MemRegion *, SymbolRef> RegionOrSymbol;
-
-struct IteratorComparison {
-private:
-  RegionOrSymbol Left, Right;
-  bool Equality;
-
-public:
-  IteratorComparison(RegionOrSymbol L, RegionOrSymbol R, bool Eq)
-      : Left(L), Right(R), Equality(Eq) {}
-
-  RegionOrSymbol getLeft() const { return Left; }
-  RegionOrSymbol getRight() const { return Right; }
-  bool isEquality() const { return Equality; }
-  bool operator==(const IteratorComparison &X) const {
-    return Left == X.Left && Right == X.Right && Equality == X.Equality;
-  }
-  bool operator!=(const IteratorComparison &X) const {
-    return Left != X.Left || Right != X.Right || Equality != X.Equality;
-  }
-  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Equality); }
-};
-
-class IteratorPastEndChecker
-    : public Checker<
-          check::PreCall, check::PostCall, check::PreStmt<CXXOperatorCallExpr>,
-          check::PostStmt<CXXConstructExpr>, check::PostStmt<DeclStmt>,
-          check::PostStmt<MaterializeTemporaryExpr>, check::BeginFunction,
-          check::DeadSymbols, eval::Assume, eval::Call> {
-  mutable IdentifierInfo *II_find = nullptr,
-                         *II_find_end = nullptr, *II_find_first_of = nullptr,
-                         *II_find_if = nullptr, *II_find_if_not = nullptr,
-                         *II_lower_bound = nullptr, *II_upper_bound = nullptr,
-                         *II_search = nullptr, *II_search_n = nullptr;
-
-  std::unique_ptr<BugType> PastEndBugType;
-
-  void handleComparison(CheckerContext &C, const SVal &RetVal, const SVal &LVal,
-                        const SVal &RVal, OverloadedOperatorKind Op) const;
-  void handleAccess(CheckerContext &C, const SVal &Val) const;
-  void handleDecrement(CheckerContext &C, const SVal &Val) const;
-  void handleEnd(CheckerContext &C, const SVal &RetVal) const;
-
-  bool evalFind(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindEnd(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindFirstOf(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindIf(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindIfNot(CheckerContext &C, const CallExpr *CE) const;
-  bool evalLowerBound(CheckerContext &C, const CallExpr *CE) const;
-  bool evalUpperBound(CheckerContext &C, const CallExpr *CE) const;
-  bool evalSearch(CheckerContext &C, const CallExpr *CE) const;
-  bool evalSearchN(CheckerContext &C, const CallExpr *CE) const;
-  void Find(CheckerContext &C, const CallExpr *CE) const;
-
-  void reportPastEndBug(const StringRef &Message, const SVal &Val,
-                        CheckerContext &C, ExplodedNode *ErrNode) const;
-  void initIdentifiers(ASTContext &Ctx) const;
-
-public:
-  IteratorPastEndChecker();
-
-  void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
-  void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
-  void checkPreStmt(const CXXOperatorCallExpr *COCE, CheckerContext &C) const;
-  void checkBeginFunction(CheckerContext &C) const;
-  void checkPostStmt(const CXXConstructExpr *CCE, CheckerContext &C) const;
-  void checkPostStmt(const DeclStmt *DS, CheckerContext &C) const;
-  void checkPostStmt(const MaterializeTemporaryExpr *MTE,
-                     CheckerContext &C) const;
-  void checkDeadSymbols(SymbolReaper &SR, CheckerContext &C) const;
-  ProgramStateRef evalAssume(ProgramStateRef State, SVal Cond,
-                             bool Assumption) const;
-  bool evalCall(const CallExpr *CE, CheckerContext &C) const;
-};
-}
-
-REGISTER_MAP_WITH_PROGRAMSTATE(IteratorSymbolMap, SymbolRef, IteratorPosition)
-REGISTER_MAP_WITH_PROGRAMSTATE(IteratorRegionMap, const MemRegion *,
-                               IteratorPosition)
-
-REGISTER_MAP_WITH_PROGRAMSTATE(IteratorComparisonMap, const SymExpr *,
-                               IteratorComparison)
-
-#define INIT_ID(Id)                                                            \
-  if (!II_##Id)                                                                \
-  II_##Id = &Ctx.Idents.get(#Id)
-
-namespace {
-
-bool isIteratorType(const QualType &Type);
-bool isIterator(const CXXRecordDecl *CRD);
-bool isEndCall(const FunctionDecl *Func);
-bool isSimpleComparisonOperator(OverloadedOperatorKind OK);
-bool isAccessOperator(OverloadedOperatorKind OK);
-bool isDecrementOperator(OverloadedOperatorKind OK);
-BinaryOperator::Opcode getOpcode(const SymExpr *SE);
-const RegionOrSymbol getRegionOrSymbol(const SVal &Val);
-const ProgramStateRef processComparison(ProgramStateRef State,
-                                        RegionOrSymbol LVal,
-                                        RegionOrSymbol RVal, bool Equal);
-const ProgramStateRef saveComparison(ProgramStateRef State,
-                                     const SymExpr *Condition, const SVal &LVal,
-                                     const SVal &RVal, bool Eq);
-const IteratorComparison *loadComparison(ProgramStateRef State,
-                                         const SymExpr *Condition);
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            const SVal &Val);
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            RegionOrSymbol RegOrSym);
-ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
-                                    IteratorPosition Pos);
-ProgramStateRef setIteratorPosition(ProgramStateRef State,
-                                    RegionOrSymbol RegOrSym,
-                                    IteratorPosition Pos);
-ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
-                                       RegionOrSymbol RegOrSym,
-                                       IteratorPosition Pos, bool Equal);
-bool contradictingIteratorPositions(IteratorPosition Pos1,
-                                    IteratorPosition Pos2, bool Equal);
-}
-
-IteratorPastEndChecker::IteratorPastEndChecker() {
-  PastEndBugType.reset(
-      new BugType(this, "Iterator Past End", "Misuse of STL APIs"));
-  PastEndBugType->setSuppressOnSink(true);
-}
-
-void IteratorPastEndChecker::checkPreCall(const CallEvent &Call,
-                                          CheckerContext &C) const {
-  // Check for access past end
-  const auto *Func = Call.getDecl()->getAsFunction();
-  if (!Func)
-    return;
-  if (Func->isOverloadedOperator()) {
-    if (isAccessOperator(Func->getOverloadedOperator())) {
-      if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
-        handleAccess(C, InstCall->getCXXThisVal());
-      } else {
-        handleAccess(C, Call.getArgSVal(0));
-      }
-    }
-  }
-}
-
-void IteratorPastEndChecker::checkPostCall(const CallEvent &Call,
-                                           CheckerContext &C) const {
-  // Record end() iterators, iterator decrementation and comparison
-  const auto *Func = Call.getDecl()->getAsFunction();
-  if (!Func)
-    return;
-  if (Func->isOverloadedOperator()) {
-    const auto Op = Func->getOverloadedOperator();
-    if (isSimpleComparisonOperator(Op)) {
-      if (Func->isCXXInstanceMember()) {
-        const auto &InstCall = static_cast<const CXXInstanceCall &>(Call);
-        handleComparison(C, InstCall.getReturnValue(), InstCall.getCXXThisVal(),
-                         InstCall.getArgSVal(0), Op);
-      } else {
-        handleComparison(C, Call.getReturnValue(), Call.getArgSVal(0),
-                         Call.getArgSVal(1), Op);
-      }
-    } else if (isDecrementOperator(Func->getOverloadedOperator())) {
-      if (Func->isCXXInstanceMember()) {
-        const auto &InstCall = static_cast<const CXXInstanceCall &>(Call);
-        handleDecrement(C, InstCall.getCXXThisVal());
-      } else {
-        handleDecrement(C, Call.getArgSVal(0));
-      }
-    }
-  } else if (Func->isCXXInstanceMember()) {
-    if (!isEndCall(Func))
-      return;
-    if (!isIteratorType(Call.getResultType()))
-      return;
-    handleEnd(C, Call.getReturnValue());
-  }
-}
-
-void IteratorPastEndChecker::checkPreStmt(const CXXOperatorCallExpr *COCE,
-                                          CheckerContext &C) const {
-  const auto *ThisExpr = COCE->getArg(0);
-
-  auto State = C.getState();
-  const auto *LCtx = C.getPredecessor()->getLocationContext();
-
-  const auto CurrentThis = State->getSVal(ThisExpr, LCtx);
-  if (const auto *Reg = CurrentThis.getAsRegion()) {
-    if (!Reg->getAs<CXXTempObjectRegion>())
-      return;
-    const auto OldState = C.getPredecessor()->getFirstPred()->getState();
-    const auto OldThis = OldState->getSVal(ThisExpr, LCtx);
-    const auto *Pos = getIteratorPosition(OldState, OldThis);
-    if (!Pos)
-      return;
-    State = setIteratorPosition(State, CurrentThis, *Pos);
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::checkBeginFunction(CheckerContext &C) const {
-  // Copy state of iterator arguments to iterator parameters
-  auto State = C.getState();
-  const auto *LCtx = C.getLocationContext();
-
-  const auto *Site = cast<StackFrameContext>(LCtx)->getCallSite();
-  if (!Site)
-    return;
-
-  const auto *FD = dyn_cast<FunctionDecl>(LCtx->getDecl());
-  if (!FD)
-    return;
-
-  const auto *CE = dyn_cast<CallExpr>(Site);
-  if (!CE)
-    return;
-
-  bool Change = false;
-  int idx = 0;
-  for (const auto P : FD->parameters()) {
-    auto Param = State->getLValue(P, LCtx);
-    auto Arg = State->getSVal(CE->getArg(idx++), LCtx->getParent());
-    const auto *Pos = getIteratorPosition(State, Arg);
-    if (!Pos)
-      continue;
-    State = setIteratorPosition(State, Param, *Pos);
-    Change = true;
-  }
-  if (Change) {
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::checkPostStmt(const CXXConstructExpr *CCE,
-                                           CheckerContext &C) const {
-  // Transfer iterator state in case of copy or move by constructor
-  const auto *ctr = CCE->getConstructor();
-  if (!ctr->isCopyOrMoveConstructor())
-    return;
-  const auto *RHSExpr = CCE->getArg(0);
-
-  auto State = C.getState();
-  const auto *LCtx = C.getLocationContext();
-
-  const auto RetVal = State->getSVal(CCE, LCtx);
-
-  const auto RHSVal = State->getSVal(RHSExpr, LCtx);
-  const auto *RHSPos = getIteratorPosition(State, RHSVal);
-  if (!RHSPos)
-    return;
-  State = setIteratorPosition(State, RetVal, *RHSPos);
-  C.addTransition(State);
-}
-
-void IteratorPastEndChecker::checkPostStmt(const DeclStmt *DS,
-                                           CheckerContext &C) const {
-  // Transfer iterator state to new variable declaration
-  for (const auto *D : DS->decls()) {
-    const auto *VD = dyn_cast<VarDecl>(D);
-    if (!VD || !VD->hasInit())
-      continue;
-
-    auto State = C.getState();
-    const auto *LCtx = C.getPredecessor()->getLocationContext();
-    const auto *Pos =
-        getIteratorPosition(State, State->getSVal(VD->getInit(), LCtx));
-    if (!Pos)
-      continue;
-    State = setIteratorPosition(State, State->getLValue(VD, LCtx), *Pos);
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::checkPostStmt(const MaterializeTemporaryExpr *MTE,
-                                           CheckerContext &C) const {
-  /* Transfer iterator state for to temporary objects */
-  auto State = C.getState();
-  const auto *LCtx = C.getPredecessor()->getLocationContext();
-  const auto *Pos =
-      getIteratorPosition(State, State->getSVal(MTE->GetTemporaryExpr(), LCtx));
-  if (!Pos)
-    return;
-  State = setIteratorPosition(State, State->getSVal(MTE, LCtx), *Pos);
-  C.addTransition(State);
-}
-
-void IteratorPastEndChecker::checkDeadSymbols(SymbolReaper &SR,
-                                              CheckerContext &C) const {
-  auto State = C.getState();
-
-  auto RegionMap = State->get<IteratorRegionMap>();
-  for (const auto Reg : RegionMap) {
-    if (!SR.isLiveRegion(Reg.first)) {
-      State = State->remove<IteratorRegionMap>(Reg.first);
-    }
-  }
-
-  auto SymbolMap = State->get<IteratorSymbolMap>();
-  for (const auto Sym : SymbolMap) {
-    if (SR.isDead(Sym.first)) {
-      State = State->remove<IteratorSymbolMap>(Sym.first);
-    }
-  }
-
-  auto ComparisonMap = State->get<IteratorComparisonMap>();
-  for (const auto Comp : ComparisonMap) {
-    if (SR.isDead(Comp.first)) {
-      State = State->remove<IteratorComparisonMap>(Comp.first);
-    }
-  }
-}
-
-ProgramStateRef IteratorPastEndChecker::evalAssume(ProgramStateRef State,
-                                                   SVal Cond,
-                                                   bool Assumption) const {
-  // Load recorded comparison and transfer iterator state between sides
-  // according to comparison operator and assumption
-  const auto *SE = Cond.getAsSymExpr();
-  if (!SE)
-    return State;
-
-  auto Opc = getOpcode(SE);
-  if (Opc != BO_EQ && Opc != BO_NE)
-    return State;
-
-  bool Negated = false;
-  const auto *Comp = loadComparison(State, SE);
-  if (!Comp) {
-    // Try negated comparison, which is a SymExpr to 0 integer comparison
-    const auto *SIE = dyn_cast<SymIntExpr>(SE);
-    if (!SIE)
-      return State;
-
-    if (SIE->getRHS() != 0)
-      return State;
-
-    SE = SIE->getLHS();
-    Negated = SIE->getOpcode() == BO_EQ; // Equal to zero means negation
-    Opc = getOpcode(SE);
-    if (Opc != BO_EQ && Opc != BO_NE)
-      return State;
-
-    Comp = loadComparison(State, SE);
-    if (!Comp)
-      return State;
-  }
-
-  return processComparison(State, Comp->getLeft(), Comp->getRight(),
-                           (Comp->isEquality() == Assumption) != Negated);
-}
-
-// FIXME: Evaluation of these STL calls should be moved to StdCLibraryFunctions
-//       checker (see patch r284960) or another similar checker for C++ STL
-//       functions (e.g. StdCXXLibraryFunctions or StdCppLibraryFunctions).
-bool IteratorPastEndChecker::evalCall(const CallExpr *CE,
-                                      CheckerContext &C) const {
-  const FunctionDecl *FD = C.getCalleeDecl(CE);
-  if (!FD)
-    return false;
-
-  ASTContext &Ctx = C.getASTContext();
-  initIdentifiers(Ctx);
-
-  if (FD->getKind() == Decl::Function) {
-    if (FD->isInStdNamespace()) {
-      if (FD->getIdentifier() == II_find) {
-        return evalFind(C, CE);
-      } else if (FD->getIdentifier() == II_find_end) {
-        return evalFindEnd(C, CE);
-      } else if (FD->getIdentifier() == II_find_first_of) {
-        return evalFindFirstOf(C, CE);
-      } else if (FD->getIdentifier() == II_find_if) {
-        return evalFindIf(C, CE);
-      } else if (FD->getIdentifier() == II_find_if_not) {
-        return evalFindIfNot(C, CE);
-      } else if (FD->getIdentifier() == II_upper_bound) {
-        return evalUpperBound(C, CE);
-      } else if (FD->getIdentifier() == II_lower_bound) {
-        return evalLowerBound(C, CE);
-      } else if (FD->getIdentifier() == II_search) {
-        return evalSearch(C, CE);
-      } else if (FD->getIdentifier() == II_search_n) {
-        return evalSearchN(C, CE);
-      }
-    }
-  }
-
-  return false;
-}
-
-void IteratorPastEndChecker::handleComparison(CheckerContext &C,
-                                              const SVal &RetVal,
-                                              const SVal &LVal,
-                                              const SVal &RVal,
-                                              OverloadedOperatorKind Op) const {
-  // Record the operands and the operator of the comparison for the next
-  // evalAssume, if the result is a symbolic expression. If it is a concrete
-  // value (only one branch is possible), then transfer the state between
-  // the operands according to the operator and the result
-  auto State = C.getState();
-  if (const auto *Condition = RetVal.getAsSymbolicExpression()) {
-    const auto *LPos = getIteratorPosition(State, LVal);
-    const auto *RPos = getIteratorPosition(State, RVal);
-    if (!LPos && !RPos)
-      return;
-    State = saveComparison(State, Condition, LVal, RVal, Op == OO_EqualEqual);
-    C.addTransition(State);
-  } else if (const auto TruthVal = RetVal.getAs<nonloc::ConcreteInt>()) {
-    if ((State = processComparison(
-             State, getRegionOrSymbol(LVal), getRegionOrSymbol(RVal),
-             (Op == OO_EqualEqual) == (TruthVal->getValue() != 0)))) {
-      C.addTransition(State);
-    } else {
-      C.generateSink(State, C.getPredecessor());
-    }
-  }
-}
-
-void IteratorPastEndChecker::handleAccess(CheckerContext &C,
-                                          const SVal &Val) const {
-  auto State = C.getState();
-  const auto *Pos = getIteratorPosition(State, Val);
-  if (Pos && Pos->isOutofRange()) {
-    auto *N = C.generateNonFatalErrorNode(State);
-    if (!N) {
-      return;
-    }
-    reportPastEndBug("Iterator accessed past its end.", Val, C, N);
-  }
-}
-
-void IteratorPastEndChecker::handleDecrement(CheckerContext &C,
-                                             const SVal &Val) const {
-  auto State = C.getState();
-  const auto *Pos = getIteratorPosition(State, Val);
-  if (Pos && Pos->isOutofRange()) {
-    State = setIteratorPosition(State, Val, IteratorPosition::getInRange());
-    // FIXME: We could also check for iterators ahead of their beginnig in the
-    //       future, but currently we do not care for such errors. We also
-    //       assume that the iterator is not past its end by more then one
-    //       position.
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::handleEnd(CheckerContext &C,
-                                       const SVal &RetVal) const {
-  auto State = C.getState();
-  State = setIteratorPosition(State, RetVal, IteratorPosition::getOutofRange());
-  C.addTransition(State);
-}
-
-bool IteratorPastEndChecker::evalFind(CheckerContext &C,
-                                      const CallExpr *CE) const {
-  if (CE->getNumArgs() == 3 && isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindEnd(CheckerContext &C,
-                                         const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType()) &&
-      isIteratorType(CE->getArg(2)->getType()) &&
-      isIteratorType(CE->getArg(3)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindFirstOf(CheckerContext &C,
-                                             const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType()) &&
-      isIteratorType(CE->getArg(2)->getType()) &&
-      isIteratorType(CE->getArg(3)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindIf(CheckerContext &C,
-                                        const CallExpr *CE) const {
-  if (CE->getNumArgs() == 3 && isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindIfNot(CheckerContext &C,
-                                           const CallExpr *CE) const {
-  if (CE->getNumArgs() == 3 && isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalLowerBound(CheckerContext &C,
-                                            const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 3 || CE->getNumArgs() == 4) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalUpperBound(CheckerContext &C,
-                                            const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 3 || CE->getNumArgs() == 4) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalSearch(CheckerContext &C,
-                                        const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType()) &&
-      isIteratorType(CE->getArg(2)->getType()) &&
-      isIteratorType(CE->getArg(3)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalSearchN(CheckerContext &C,
-                                         const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-void IteratorPastEndChecker::Find(CheckerContext &C, const CallExpr *CE) const {
-  auto state = C.getState();
-  auto &svalBuilder = C.getSValBuilder();
-  const auto *LCtx = C.getLocationContext();
-
-  auto RetVal = svalBuilder.conjureSymbolVal(nullptr, CE, LCtx, C.blockCount());
-  auto SecondParam = state->getSVal(CE->getArg(1), LCtx);
-
-  auto stateFound = state->BindExpr(CE, LCtx, RetVal);
-  auto stateNotFound = state->BindExpr(CE, LCtx, SecondParam);
-
-  C.addTransition(stateFound);
-  C.addTransition(stateNotFound);
-}
-
-void IteratorPastEndChecker::reportPastEndBug(const StringRef &Message,
-                                              const SVal &Val,
-                                              CheckerContext &C,
-                                              ExplodedNode *ErrNode) const {
-  auto R = llvm::make_unique<BugReport>(*PastEndBugType, Message, ErrNode);
-  R->markInteresting(Val);
-  C.emitReport(std::move(R));
-}
-
-void IteratorPastEndChecker::initIdentifiers(ASTContext &Ctx) const {
-  INIT_ID(find);
-  INIT_ID(find_end);
-  INIT_ID(find_first_of);
-  INIT_ID(find_if);
-  INIT_ID(find_if_not);
-  INIT_ID(lower_bound);
-  INIT_ID(upper_bound);
-  INIT_ID(search);
-  INIT_ID(search_n);
-}
-
-namespace {
-
-bool isIteratorType(const QualType &Type) {
-  if (Type->isPointerType())
-    return true;
-
-  const auto *CRD = Type->getUnqualifiedDesugaredType()->getAsCXXRecordDecl();
-  return isIterator(CRD);
-}
-
-bool isIterator(const CXXRecordDecl *CRD) {
-  if (!CRD)
-    return false;
-
-  const auto Name = CRD->getName();
-  if (!(Name.endswith_lower("iterator") || Name.endswith_lower("iter") ||
-        Name.endswith_lower("it")))
-    return false;
-
-  bool HasCopyCtor = false, HasCopyAssign = true, HasDtor = false,
-       HasPreIncrOp = false, HasPostIncrOp = false, HasDerefOp = false;
-  for (const auto *Method : CRD->methods()) {
-    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Method)) {
-      if (Ctor->isCopyConstructor()) {
-        HasCopyCtor = !Ctor->isDeleted() && Ctor->getAccess() == AS_public;
-      }
-      continue;
-    }
-    if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
-      HasDtor = !Dtor->isDeleted() && Dtor->getAccess() == AS_public;
-      continue;
-    }
-    if (Method->isCopyAssignmentOperator()) {
-      HasCopyAssign = !Method->isDeleted() && Method->getAccess() == AS_public;
-      continue;
-    }
-    if (!Method->isOverloadedOperator())
-      continue;
-    const auto OPK = Method->getOverloadedOperator();
-    if (OPK == OO_PlusPlus) {
-      HasPreIncrOp = HasPreIncrOp || (Method->getNumParams() == 0);
-      HasPostIncrOp = HasPostIncrOp || (Method->getNumParams() == 1);
-      continue;
-    }
-    if (OPK == OO_Star) {
-      HasDerefOp = (Method->getNumParams() == 0);
-      continue;
-    }
-  }
-
-  return HasCopyCtor && HasCopyAssign && HasDtor && HasPreIncrOp &&
-         HasPostIncrOp && HasDerefOp;
-}
-
-bool isEndCall(const FunctionDecl *Func) {
-  const auto *IdInfo = Func->getIdentifier();
-  if (!IdInfo)
-    return false;
-  return IdInfo->getName().endswith_lower("end");
-}
-
-bool isSimpleComparisonOperator(OverloadedOperatorKind OK) {
-  return OK == OO_EqualEqual || OK == OO_ExclaimEqual;
-}
-
-bool isAccessOperator(OverloadedOperatorKind OK) {
-  return OK == OO_Star || OK == OO_Arrow || OK == OO_ArrowStar ||
-         OK == OO_Plus || OK == OO_PlusEqual || OK == OO_PlusPlus ||
-         OK == OO_Subscript;
-}
-
-bool isDecrementOperator(OverloadedOperatorKind OK) {
-  return OK == OO_MinusEqual || OK == OO_MinusMinus;
-}
-
-BinaryOperator::Opcode getOpcode(const SymExpr *SE) {
-  if (const auto *BSE = dyn_cast<BinarySymExpr>(SE)) {
-    return BSE->getOpcode();
-  } else if (const auto *SC = dyn_cast<SymbolConjured>(SE)) {
-    const auto *COE = dyn_cast<CXXOperatorCallExpr>(SC->getStmt());
-    if (!COE)
-      return BO_Comma; // Extremal value, neither EQ nor NE
-    if (COE->getOperator() == OO_EqualEqual) {
-      return BO_EQ;
-    } else if (COE->getOperator() == OO_ExclaimEqual) {
-      return BO_NE;
-    }
-    return BO_Comma; // Extremal value, neither EQ nor NE
-  }
-  return BO_Comma; // Extremal value, neither EQ nor NE
-}
-
-const RegionOrSymbol getRegionOrSymbol(const SVal &Val) {
-  if (const auto Reg = Val.getAsRegion()) {
-    return Reg;
-  } else if (const auto Sym = Val.getAsSymbol()) {
-    return Sym;
-  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
-    return LCVal->getRegion();
-  }
-  return RegionOrSymbol();
-}
-
-const ProgramStateRef processComparison(ProgramStateRef State,
-                                        RegionOrSymbol LVal,
-                                        RegionOrSymbol RVal, bool Equal) {
-  const auto *LPos = getIteratorPosition(State, LVal);
-  const auto *RPos = getIteratorPosition(State, RVal);
-  if (LPos && !RPos) {
-    State = adjustIteratorPosition(State, RVal, *LPos, Equal);
-  } else if (!LPos && RPos) {
-    State = adjustIteratorPosition(State, LVal, *RPos, Equal);
-  } else if (LPos && RPos) {
-    if (contradictingIteratorPositions(*LPos, *RPos, Equal)) {
-      return nullptr;
-    }
-  }
-  return State;
-}
-
-const ProgramStateRef saveComparison(ProgramStateRef State,
-                                     const SymExpr *Condition, const SVal &LVal,
-                                     const SVal &RVal, bool Eq) {
-  const auto Left = getRegionOrSymbol(LVal);
-  const auto Right = getRegionOrSymbol(RVal);
-  if (!Left || !Right)
-    return State;
-  return State->set<IteratorComparisonMap>(Condition,
-                                           IteratorComparison(Left, Right, Eq));
-}
-
-const IteratorComparison *loadComparison(ProgramStateRef State,
-                                         const SymExpr *Condition) {
-  return State->get<IteratorComparisonMap>(Condition);
-}
-
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            const SVal &Val) {
-  if (const auto Reg = Val.getAsRegion()) {
-    return State->get<IteratorRegionMap>(Reg);
-  } else if (const auto Sym = Val.getAsSymbol()) {
-    return State->get<IteratorSymbolMap>(Sym);
-  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
-    return State->get<IteratorRegionMap>(LCVal->getRegion());
-  }
-  return nullptr;
-}
-
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            RegionOrSymbol RegOrSym) {
-  if (RegOrSym.is<const MemRegion *>()) {
-    return State->get<IteratorRegionMap>(RegOrSym.get<const MemRegion *>());
-  } else if (RegOrSym.is<SymbolRef>()) {
-    return State->get<IteratorSymbolMap>(RegOrSym.get<SymbolRef>());
-  }
-  return nullptr;
-}
-
-ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
-                                    IteratorPosition Pos) {
-  if (const auto Reg = Val.getAsRegion()) {
-    return State->set<IteratorRegionMap>(Reg, Pos);
-  } else if (const auto Sym = Val.getAsSymbol()) {
-    return State->set<IteratorSymbolMap>(Sym, Pos);
-  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
-    return State->set<IteratorRegionMap>(LCVal->getRegion(), Pos);
-  }
-  return nullptr;
-}
-
-ProgramStateRef setIteratorPosition(ProgramStateRef State,
-                                    RegionOrSymbol RegOrSym,
-                                    IteratorPosition Pos) {
-  if (RegOrSym.is<const MemRegion *>()) {
-    return State->set<IteratorRegionMap>(RegOrSym.get<const MemRegion *>(),
-                                         Pos);
-  } else if (RegOrSym.is<SymbolRef>()) {
-    return State->set<IteratorSymbolMap>(RegOrSym.get<SymbolRef>(), Pos);
-  }
-  return nullptr;
-}
-
-ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
-                                       RegionOrSymbol RegOrSym,
-                                       IteratorPosition Pos, bool Equal) {
-
-  if ((Pos.isInRange() && Equal) || (Pos.isOutofRange() && !Equal)) {
-    return setIteratorPosition(State, RegOrSym, IteratorPosition::getInRange());
-  } else if (Pos.isOutofRange() && Equal) {
-    return setIteratorPosition(State, RegOrSym,
-                               IteratorPosition::getOutofRange());
-  } else {
-    return State;
-  }
-}
-
-bool contradictingIteratorPositions(IteratorPosition Pos1,
-                                    IteratorPosition Pos2, bool Equal) {
-  return ((Pos1 != Pos2) && Equal) ||
-         ((Pos1.isOutofRange() && Pos2.isOutofRange()) && !Equal);
-}
-}
-
-void ento::registerIteratorPastEndChecker(CheckerManager &Mgr) {
-  Mgr.registerChecker<IteratorPastEndChecker>();
-}
diff --git a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
index 7ef79c683c49..0e3a649e88f7 100644
--- a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
@@ -25,7 +25,13 @@ using namespace ento;
 namespace {
 
 struct LockState {
-  enum Kind { Destroyed, Locked, Unlocked } K;
+  enum Kind {
+    Destroyed,
+    Locked,
+    Unlocked,
+    UntouchedAndPossiblyDestroyed,
+    UnlockedAndPossiblyDestroyed
+  } K;
 
 private:
   LockState(Kind K) : K(K) {}
@@ -34,6 +40,12 @@ public:
   static LockState getLocked() { return LockState(Locked); }
   static LockState getUnlocked() { return LockState(Unlocked); }
   static LockState getDestroyed() { return LockState(Destroyed); }
+  static LockState getUntouchedAndPossiblyDestroyed() {
+    return LockState(UntouchedAndPossiblyDestroyed);
+  }
+  static LockState getUnlockedAndPossiblyDestroyed() {
+    return LockState(UnlockedAndPossiblyDestroyed);
+  }
 
   bool operator==(const LockState &X) const {
     return K == X.K;
@@ -42,13 +54,20 @@ public:
   bool isLocked() const { return K == Locked; }
   bool isUnlocked() const { return K == Unlocked; }
   bool isDestroyed() const { return K == Destroyed; }
+  bool isUntouchedAndPossiblyDestroyed() const {
+    return K == UntouchedAndPossiblyDestroyed;
+  }
+  bool isUnlockedAndPossiblyDestroyed() const {
+    return K == UnlockedAndPossiblyDestroyed;
+  }
 
   void Profile(llvm::FoldingSetNodeID &ID) const {
     ID.AddInteger(K);
   }
 };
 
-class PthreadLockChecker : public Checker< check::PostStmt<CallExpr> > {
+class PthreadLockChecker
+    : public Checker<check::PostStmt<CallExpr>, check::DeadSymbols> {
   mutable std::unique_ptr<BugType> BT_doublelock;
   mutable std::unique_ptr<BugType> BT_doubleunlock;
   mutable std::unique_ptr<BugType> BT_destroylock;
@@ -61,22 +80,31 @@ class PthreadLockChecker : public Checker< check::PostStmt<CallExpr> > {
   };
 public:
   void checkPostStmt(const CallExpr *CE, CheckerContext &C) const;
+  void checkDeadSymbols(SymbolReaper &SymReaper, CheckerContext &C) const;
 
   void AcquireLock(CheckerContext &C, const CallExpr *CE, SVal lock,
                    bool isTryLock, enum LockingSemantics semantics) const;
 
   void ReleaseLock(CheckerContext &C, const CallExpr *CE, SVal lock) const;
-  void DestroyLock(CheckerContext &C, const CallExpr *CE, SVal Lock) const;
+  void DestroyLock(CheckerContext &C, const CallExpr *CE, SVal Lock,
+                   enum LockingSemantics semantics) const;
   void InitLock(CheckerContext &C, const CallExpr *CE, SVal Lock) const;
   void reportUseDestroyedBug(CheckerContext &C, const CallExpr *CE) const;
+  ProgramStateRef resolvePossiblyDestroyedMutex(ProgramStateRef state,
+                                                const MemRegion *lockR,
+                                                const SymbolRef *sym) const;
 };
 } // end anonymous namespace
 
-// GDM Entry for tracking lock state.
+// A stack of locks for tracking lock-unlock order.
 REGISTER_LIST_WITH_PROGRAMSTATE(LockSet, const MemRegion *)
 
+// An entry for tracking lock states.
 REGISTER_MAP_WITH_PROGRAMSTATE(LockMap, const MemRegion *, LockState)
 
+// Return values for unresolved calls to pthread_mutex_destroy().
+REGISTER_MAP_WITH_PROGRAMSTATE(DestroyRetVal, const MemRegion *, SymbolRef)
+
 void PthreadLockChecker::checkPostStmt(const CallExpr *CE,
                                        CheckerContext &C) const {
   ProgramStateRef state = C.getState();
@@ -113,13 +141,49 @@ void PthreadLockChecker::checkPostStmt(const CallExpr *CE,
            FName == "lck_mtx_unlock" ||
            FName == "lck_rw_done")
     ReleaseLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
-  else if (FName == "pthread_mutex_destroy" ||
-           FName == "lck_mtx_destroy")
-    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
+  else if (FName == "pthread_mutex_destroy")
+    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx), PthreadSemantics);
+  else if (FName == "lck_mtx_destroy")
+    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx), XNUSemantics);
   else if (FName == "pthread_mutex_init")
     InitLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
 }
 
+// When a lock is destroyed, in some semantics(like PthreadSemantics) we are not
+// sure if the destroy call has succeeded or failed, and the lock enters one of
+// the 'possibly destroyed' state. There is a short time frame for the
+// programmer to check the return value to see if the lock was successfully
+// destroyed. Before we model the next operation over that lock, we call this
+// function to see if the return value was checked by now and set the lock state
+// - either to destroyed state or back to its previous state.
+
+// In PthreadSemantics, pthread_mutex_destroy() returns zero if the lock is
+// successfully destroyed and it returns a non-zero value otherwise.
+ProgramStateRef PthreadLockChecker::resolvePossiblyDestroyedMutex(
+    ProgramStateRef state, const MemRegion *lockR, const SymbolRef *sym) const {
+  const LockState *lstate = state->get<LockMap>(lockR);
+  // Existence in DestroyRetVal ensures existence in LockMap.
+  // Existence in Destroyed also ensures that the lock state for lockR is either
+  // UntouchedAndPossiblyDestroyed or UnlockedAndPossiblyDestroyed.
+  assert(lstate->isUntouchedAndPossiblyDestroyed() ||
+         lstate->isUnlockedAndPossiblyDestroyed());
+
+  ConstraintManager &CMgr = state->getConstraintManager();
+  ConditionTruthVal retZero = CMgr.isNull(state, *sym);
+  if (retZero.isConstrainedFalse()) {
+    if (lstate->isUntouchedAndPossiblyDestroyed())
+      state = state->remove<LockMap>(lockR);
+    else if (lstate->isUnlockedAndPossiblyDestroyed())
+      state = state->set<LockMap>(lockR, LockState::getUnlocked());
+  } else
+    state = state->set<LockMap>(lockR, LockState::getDestroyed());
+
+  // Removing the map entry (lockR, sym) from DestroyRetVal as the lock state is
+  // now resolved.
+  state = state->remove<DestroyRetVal>(lockR);
+  return state;
+}
+
 void PthreadLockChecker::AcquireLock(CheckerContext &C, const CallExpr *CE,
                                      SVal lock, bool isTryLock,
                                      enum LockingSemantics semantics) const {
@@ -129,6 +193,9 @@ void PthreadLockChecker::AcquireLock(CheckerContext &C, const CallExpr *CE,
     return;
 
   ProgramStateRef state = C.getState();
+  const SymbolRef *sym = state->get<DestroyRetVal>(lockR);
+  if (sym)
+    state = resolvePossiblyDestroyedMutex(state, lockR, sym);
 
   SVal X = state->getSVal(CE, C.getLocationContext());
   if (X.isUnknownOrUndef())
@@ -197,6 +264,9 @@ void PthreadLockChecker::ReleaseLock(CheckerContext &C, const CallExpr *CE,
     return;
 
   ProgramStateRef state = C.getState();
+  const SymbolRef *sym = state->get<DestroyRetVal>(lockR);
+  if (sym)
+    state = resolvePossiblyDestroyedMutex(state, lockR, sym);
 
   if (const LockState *LState = state->get<LockMap>(lockR)) {
     if (LState->isUnlocked()) {
@@ -245,7 +315,8 @@ void PthreadLockChecker::ReleaseLock(CheckerContext &C, const CallExpr *CE,
 }
 
 void PthreadLockChecker::DestroyLock(CheckerContext &C, const CallExpr *CE,
-                                     SVal Lock) const {
+                                     SVal Lock,
+                                     enum LockingSemantics semantics) const {
 
   const MemRegion *LockR = Lock.getAsRegion();
   if (!LockR)
@@ -253,13 +324,38 @@ void PthreadLockChecker::DestroyLock(CheckerContext &C, const CallExpr *CE,
 
   ProgramStateRef State = C.getState();
 
+  const SymbolRef *sym = State->get<DestroyRetVal>(LockR);
+  if (sym)
+    State = resolvePossiblyDestroyedMutex(State, LockR, sym);
+
   const LockState *LState = State->get<LockMap>(LockR);
-  if (!LState || LState->isUnlocked()) {
-    State = State->set<LockMap>(LockR, LockState::getDestroyed());
-    C.addTransition(State);
-    return;
+  // Checking the return value of the destroy method only in the case of
+  // PthreadSemantics
+  if (semantics == PthreadSemantics) {
+    if (!LState || LState->isUnlocked()) {
+      SymbolRef sym = C.getSVal(CE).getAsSymbol();
+      if (!sym) {
+        State = State->remove<LockMap>(LockR);
+        C.addTransition(State);
+        return;
+      }
+      State = State->set<DestroyRetVal>(LockR, sym);
+      if (LState && LState->isUnlocked())
+        State = State->set<LockMap>(
+            LockR, LockState::getUnlockedAndPossiblyDestroyed());
+      else
+        State = State->set<LockMap>(
+            LockR, LockState::getUntouchedAndPossiblyDestroyed());
+      C.addTransition(State);
+      return;
+    }
+  } else {
+    if (!LState || LState->isUnlocked()) {
+      State = State->set<LockMap>(LockR, LockState::getDestroyed());
+      C.addTransition(State);
+      return;
+    }
   }
-
   StringRef Message;
 
   if (LState->isLocked()) {
@@ -288,6 +384,10 @@ void PthreadLockChecker::InitLock(CheckerContext &C, const CallExpr *CE,
 
   ProgramStateRef State = C.getState();
 
+  const SymbolRef *sym = State->get<DestroyRetVal>(LockR);
+  if (sym)
+    State = resolvePossiblyDestroyedMutex(State, LockR, sym);
+
   const struct LockState *LState = State->get<LockMap>(LockR);
   if (!LState || LState->isDestroyed()) {
     State = State->set<LockMap>(LockR, LockState::getUnlocked());
@@ -328,6 +428,26 @@ void PthreadLockChecker::reportUseDestroyedBug(CheckerContext &C,
   C.emitReport(std::move(Report));
 }
 
+void PthreadLockChecker::checkDeadSymbols(SymbolReaper &SymReaper,
+                                          CheckerContext &C) const {
+  ProgramStateRef State = C.getState();
+
+  // TODO: Clean LockMap when a mutex region dies.
+
+  DestroyRetValTy TrackedSymbols = State->get<DestroyRetVal>();
+  for (DestroyRetValTy::iterator I = TrackedSymbols.begin(),
+                                 E = TrackedSymbols.end();
+       I != E; ++I) {
+    const SymbolRef Sym = I->second;
+    const MemRegion *lockR = I->first;
+    bool IsSymDead = SymReaper.isDead(Sym);
+    // Remove the dead symbol from the return value symbols map.
+    if (IsSymDead)
+      State = resolvePossiblyDestroyedMutex(State, lockR, &Sym);
+  }
+  C.addTransition(State);
+}
+
 void ento::registerPthreadLockChecker(CheckerManager &mgr) {
   mgr.registerChecker<PthreadLockChecker>();
 }
diff --git a/lib/StaticAnalyzer/Core/ProgramState.cpp b/lib/StaticAnalyzer/Core/ProgramState.cpp
index 31556c792fc5..3215c3ccd21e 100644
--- a/lib/StaticAnalyzer/Core/ProgramState.cpp
+++ b/lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -644,15 +644,33 @@ ProgramStateRef ProgramState::addTaint(const Stmt *S,
   if (const Expr *E = dyn_cast_or_null<Expr>(S))
     S = E->IgnoreParens();
 
-  SymbolRef Sym = getSVal(S, LCtx).getAsSymbol();
+  return addTaint(getSVal(S, LCtx), Kind);
+}
+
+ProgramStateRef ProgramState::addTaint(SVal V,
+                                       TaintTagType Kind) const {
+  SymbolRef Sym = V.getAsSymbol();
   if (Sym)
     return addTaint(Sym, Kind);
 
-  const MemRegion *R = getSVal(S, LCtx).getAsRegion();
-  addTaint(R, Kind);
+  // If the SVal represents a structure, try to mass-taint all values within the
+  // structure. For now it only works efficiently on lazy compound values that
+  // were conjured during a conservative evaluation of a function - either as
+  // return values of functions that return structures or arrays by value, or as
+  // values of structures or arrays passed into the function by reference,
+  // directly or through pointer aliasing. Such lazy compound values are
+  // characterized by having exactly one binding in their captured store within
+  // their parent region, which is a conjured symbol default-bound to the base
+  // region of the parent region.
+  if (auto LCV = V.getAs<nonloc::LazyCompoundVal>()) {
+    if (Optional<SVal> binding = getStateManager().StoreMgr->getDefaultBinding(*LCV)) {
+      if (SymbolRef Sym = binding->getAsSymbol())
+        return addPartialTaint(Sym, LCV->getRegion(), Kind);
+    }
+  }
 
-  // Cannot add taint, so just return the state.
-  return this;
+  const MemRegion *R = V.getAsRegion();
+  return addTaint(R, Kind);
 }
 
 ProgramStateRef ProgramState::addTaint(const MemRegion *R,
@@ -674,6 +692,27 @@ ProgramStateRef ProgramState::addTaint(SymbolRef Sym,
   return NewState;
 }
 
+ProgramStateRef ProgramState::addPartialTaint(SymbolRef ParentSym,
+                                              const SubRegion *SubRegion,
+                                              TaintTagType Kind) const {
+  // Ignore partial taint if the entire parent symbol is already tainted.
+  if (contains<TaintMap>(ParentSym) && *get<TaintMap>(ParentSym) == Kind)
+    return this;
+
+  // Partial taint applies if only a portion of the symbol is tainted.
+  if (SubRegion == SubRegion->getBaseRegion())
+    return addTaint(ParentSym, Kind);
+
+  const TaintedSubRegions *SavedRegs = get<DerivedSymTaint>(ParentSym);
+  TaintedSubRegions Regs =
+      SavedRegs ? *SavedRegs : stateMgr->TSRFactory.getEmptyMap();
+
+  Regs = stateMgr->TSRFactory.add(Regs, SubRegion, Kind);
+  ProgramStateRef NewState = set<DerivedSymTaint>(ParentSym, Regs);
+  assert(NewState);
+  return NewState;
+}
+
 bool ProgramState::isTainted(const Stmt *S, const LocationContext *LCtx,
                              TaintTagType Kind) const {
   if (const Expr *E = dyn_cast_or_null<Expr>(S))
@@ -714,31 +753,52 @@ bool ProgramState::isTainted(SymbolRef Sym, TaintTagType Kind) const {
     return false;
 
   // Traverse all the symbols this symbol depends on to see if any are tainted.
-  bool Tainted = false;
   for (SymExpr::symbol_iterator SI = Sym->symbol_begin(), SE =Sym->symbol_end();
        SI != SE; ++SI) {
     if (!isa<SymbolData>(*SI))
       continue;
 
-    const TaintTagType *Tag = get<TaintMap>(*SI);
-    Tainted = (Tag && *Tag == Kind);
+    if (const TaintTagType *Tag = get<TaintMap>(*SI)) {
+      if (*Tag == Kind)
+        return true;
+    }
 
-    // If this is a SymbolDerived with a tainted parent, it's also tainted.
-    if (const SymbolDerived *SD = dyn_cast<SymbolDerived>(*SI))
-      Tainted = Tainted || isTainted(SD->getParentSymbol(), Kind);
+    if (const SymbolDerived *SD = dyn_cast<SymbolDerived>(*SI)) {
+      // If this is a SymbolDerived with a tainted parent, it's also tainted.
+      if (isTainted(SD->getParentSymbol(), Kind))
+        return true;
+
+      // If this is a SymbolDerived with the same parent symbol as another
+      // tainted SymbolDerived and a region that's a sub-region of that tainted
+      // symbol, it's also tainted.
+      if (const TaintedSubRegions *Regs =
+              get<DerivedSymTaint>(SD->getParentSymbol())) {
+        const TypedValueRegion *R = SD->getRegion();
+        for (auto I : *Regs) {
+          // FIXME: The logic to identify tainted regions could be more
+          // complete. For example, this would not currently identify
+          // overlapping fields in a union as tainted. To identify this we can
+          // check for overlapping/nested byte offsets.
+          if (Kind == I.second &&
+              (R == I.first || R->isSubRegionOf(I.first)))
+            return true;
+        }
+      }
+    }
 
     // If memory region is tainted, data is also tainted.
-    if (const SymbolRegionValue *SRV = dyn_cast<SymbolRegionValue>(*SI))
-      Tainted = Tainted || isTainted(SRV->getRegion(), Kind);
-
-    // If If this is a SymbolCast from a tainted value, it's also tainted.
-    if (const SymbolCast *SC = dyn_cast<SymbolCast>(*SI))
-      Tainted = Tainted || isTainted(SC->getOperand(), Kind);
+    if (const SymbolRegionValue *SRV = dyn_cast<SymbolRegionValue>(*SI)) {
+      if (isTainted(SRV->getRegion(), Kind))
+        return true;
+    }
 
-    if (Tainted)
-      return true;
+    // If this is a SymbolCast from a tainted value, it's also tainted.
+    if (const SymbolCast *SC = dyn_cast<SymbolCast>(*SI)) {
+      if (isTainted(SC->getOperand(), Kind))
+        return true;
+    }
   }
 
-  return Tainted;
+  return false;
 }
 
diff --git a/lib/StaticAnalyzer/Core/RegionStore.cpp b/lib/StaticAnalyzer/Core/RegionStore.cpp
index 3000e13d32c6..28f78fa3ff5e 100644
--- a/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -496,7 +496,10 @@ public: // Part of public interface to class.
 
   Optional<SVal> getDefaultBinding(Store S, const MemRegion *R) override {
     RegionBindingsRef B = getRegionBindings(S);
-    return B.getDefaultBinding(R);
+    // Default bindings are always applied over a base region so look up the
+    // base region's default binding, otherwise the lookup will fail when R
+    // is at an offset from R->getBaseRegion().
+    return B.getDefaultBinding(R->getBaseRegion());
   }
 
   SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType());
diff --git a/test/Analysis/Inputs/system-header-simulator-cxx.h b/test/Analysis/Inputs/system-header-simulator-cxx.h
index 005e7f57af6f..809b768d71e0 100644
--- a/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -8,18 +8,61 @@
 typedef unsigned char uint8_t;
 
 typedef __typeof__(sizeof(int)) size_t;
+typedef __typeof__((char*)0-(char*)0) ptrdiff_t;
 void *memmove(void *s1, const void *s2, size_t n);
 
-template <typename T, typename Ptr, typename Ref> struct __iterator {
-  typedef __iterator<T, T *, T &> iterator;
-  typedef __iterator<T, const T *, const T &> const_iterator;
+namespace std {
+  struct input_iterator_tag { };
+  struct output_iterator_tag { };
+  struct forward_iterator_tag : public input_iterator_tag { };
+  struct bidirectional_iterator_tag : public forward_iterator_tag { };
+  struct random_access_iterator_tag : public bidirectional_iterator_tag { };
 
-  __iterator(const Ptr p) : ptr(p) {}
+  template <typename Iterator> struct iterator_traits {
+    typedef typename Iterator::difference_type difference_type;
+    typedef typename Iterator::value_type value_type;
+    typedef typename Iterator::pointer pointer;
+    typedef typename Iterator::reference reference;
+    typedef typename Iterator::iterator_category iterator_category;
+  };
+}
+
+template <typename T, typename Ptr, typename Ref> struct __vector_iterator {
+  typedef __vector_iterator<T, T *, T &> iterator;
+  typedef __vector_iterator<T, const T *, const T &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  __vector_iterator(const Ptr p = 0) : ptr(p) {}
+  __vector_iterator(const iterator &rhs): ptr(rhs.base()) {}
+  __vector_iterator<T, Ptr, Ref> operator++() { ++ ptr; return *this; }
+  __vector_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    ++ ptr;
+    return tmp;
+  }
+  __vector_iterator<T, Ptr, Ref> operator--() { -- ptr; return *this; }
+  __vector_iterator<T, Ptr, Ref> operator--(int) {
+    auto tmp = *this; -- ptr;
+    return tmp;
+  }
+  __vector_iterator<T, Ptr, Ref> operator+(difference_type n) {
+    return ptr + n;
+  }
+  __vector_iterator<T, Ptr, Ref> operator-(difference_type n) {
+    return ptr - n;
+  }
+  __vector_iterator<T, Ptr, Ref> operator+=(difference_type n) {
+    return ptr += n;
+  }
+  __vector_iterator<T, Ptr, Ref> operator-=(difference_type n) {
+    return ptr -= n;
+  }
 
-  __iterator<T, Ptr, Ref> operator++() { return *this; }
-  __iterator<T, Ptr, Ref> operator++(int) { return *this; }
-  __iterator<T, Ptr, Ref> operator--() { return *this; }
-  __iterator<T, Ptr, Ref> operator--(int) { return *this; }
   Ref operator*() const { return *ptr; }
   Ptr operator->() const { return *ptr; }
 
@@ -29,10 +72,136 @@ template <typename T, typename Ptr, typename Ref> struct __iterator {
   bool operator!=(const iterator &rhs) const { return ptr != rhs.ptr; }
   bool operator!=(const const_iterator &rhs) const { return ptr != rhs.ptr; }
 
+  const Ptr& base() const { return ptr; }
+
 private:
   Ptr ptr;
 };
 
+template <typename T, typename Ptr, typename Ref> struct __deque_iterator {
+  typedef __deque_iterator<T, T *, T &> iterator;
+  typedef __deque_iterator<T, const T *, const T &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  __deque_iterator(const Ptr p = 0) : ptr(p) {}
+  __deque_iterator(const iterator &rhs): ptr(rhs.base()) {}
+  __deque_iterator<T, Ptr, Ref> operator++() { ++ ptr; return *this; }
+  __deque_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    ++ ptr;
+    return tmp;
+  }
+  __deque_iterator<T, Ptr, Ref> operator--() { -- ptr; return *this; }
+  __deque_iterator<T, Ptr, Ref> operator--(int) {
+    auto tmp = *this; -- ptr;
+    return tmp;
+  }
+  __deque_iterator<T, Ptr, Ref> operator+(difference_type n) {
+    return ptr + n;
+  }
+  __deque_iterator<T, Ptr, Ref> operator-(difference_type n) {
+    return ptr - n;
+  }
+  __deque_iterator<T, Ptr, Ref> operator+=(difference_type n) {
+    return ptr += n;
+  }
+  __deque_iterator<T, Ptr, Ref> operator-=(difference_type n) {
+    return ptr -= n;
+  }
+
+  Ref operator*() const { return *ptr; }
+  Ptr operator->() const { return *ptr; }
+
+  bool operator==(const iterator &rhs) const { return ptr == rhs.ptr; }
+  bool operator==(const const_iterator &rhs) const { return ptr == rhs.ptr; }
+
+  bool operator!=(const iterator &rhs) const { return ptr != rhs.ptr; }
+  bool operator!=(const const_iterator &rhs) const { return ptr != rhs.ptr; }
+
+  const Ptr& base() const { return ptr; }
+
+private:
+  Ptr ptr;
+};
+
+template <typename T, typename Ptr, typename Ref> struct __list_iterator {
+  typedef __list_iterator<T, __typeof__(T::data) *, __typeof__(T::data) &> iterator;
+  typedef __list_iterator<T, const __typeof__(T::data) *, const __typeof__(T::data) &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::bidirectional_iterator_tag iterator_category;
+
+  __list_iterator(T* it = 0) : item(it) {}
+  __list_iterator(const iterator &rhs): item(rhs.base()) {}
+  __list_iterator<T, Ptr, Ref> operator++() { item = item->next; return *this; }
+  __list_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    item = item->next;
+    return tmp;
+  }
+  __list_iterator<T, Ptr, Ref> operator--() { item = item->prev; return *this; }
+  __list_iterator<T, Ptr, Ref> operator--(int) {
+    auto tmp = *this;
+    item = item->prev;
+    return tmp;
+  }
+
+  Ref operator*() const { return item->data; }
+  Ptr operator->() const { return item->data; }
+
+  bool operator==(const iterator &rhs) const { return item == rhs->item; }
+  bool operator==(const const_iterator &rhs) const { return item == rhs->item; }
+
+  bool operator!=(const iterator &rhs) const { return item != rhs->item; }
+  bool operator!=(const const_iterator &rhs) const { return item != rhs->item; }
+
+  const T* &base() const { return item; }
+
+private:
+  T* item;
+};
+
+template <typename T, typename Ptr, typename Ref> struct __fwdl_iterator {
+  typedef __fwdl_iterator<T, __typeof__(T::data) *, __typeof__(T::data) &> iterator;
+  typedef __fwdl_iterator<T, const __typeof__(T::data) *, const __typeof__(T::data) &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::forward_iterator_tag iterator_category;
+
+  __fwdl_iterator(T* it = 0) : item(it) {}
+  __fwdl_iterator(const iterator &rhs): item(rhs.base()) {}
+  __fwdl_iterator<T, Ptr, Ref> operator++() { item = item->next; return *this; }
+  __fwdl_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    item = item->next;
+    return tmp;
+  }
+  Ref operator*() const { return item->data; }
+  Ptr operator->() const { return item->data; }
+
+  bool operator==(const iterator &rhs) const { return item == rhs->item; }
+  bool operator==(const const_iterator &rhs) const { return item == rhs->item; }
+
+  bool operator!=(const iterator &rhs) const { return item != rhs->item; }
+  bool operator!=(const const_iterator &rhs) const { return item != rhs->item; }
+
+  const T* &base() const { return item; }
+
+private:
+  T* item;
+};
+
 namespace std {
   template <class T1, class T2>
   struct pair {
@@ -43,30 +212,124 @@ namespace std {
     pair(const T1 &a, const T2 &b) : first(a), second(b) {}
     
     template<class U1, class U2>
-    pair(const pair<U1, U2> &other) : first(other.first), second(other.second) {}
+    pair(const pair<U1, U2> &other) : first(other.first),
+                                      second(other.second) {}
   };
   
   typedef __typeof__(sizeof(int)) size_t;
+
+  template <class T> class initializer_list;
   
+  template< class T > struct remove_reference      {typedef T type;};
+  template< class T > struct remove_reference<T&>  {typedef T type;};
+  template< class T > struct remove_reference<T&&> {typedef T type;};
+
+  template<class T> 
+  typename remove_reference<T>::type&& move(T&& a) {
+    typedef typename remove_reference<T>::type&& RvalRef;
+    return static_cast<RvalRef>(a);
+  }
+
   template<typename T>
   class vector {
-    typedef __iterator<T, T *, T &> iterator;
-    typedef __iterator<T, const T *, const T &> const_iterator;
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __vector_iterator<T, T *, T &> iterator;
+    typedef __vector_iterator<T, const T *, const T &> const_iterator;
 
     T *_start;
     T *_finish;
     T *_end_of_storage;
   public:
     vector() : _start(0), _finish(0), _end_of_storage(0) {}
+    template <typename InputIterator>
+    vector(InputIterator first, InputIterator last);
+    vector(const vector &other);
+    vector(vector &&other);
     ~vector();
     
     size_t size() const {
       return size_t(_finish - _start);
     }
+
+    T &operator[](size_t n) {
+      return _start[n];
+    }
+    
+    const T &operator[](size_t n) const {
+      return _start[n];
+    }
+    
+    iterator begin() { return iterator(_start); }
+    const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
+    iterator end() { return iterator(_finish); }
+    const_iterator end() const { return const_iterator(_finish); }
+    const_iterator cend() const { return const_iterator(_finish); }
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+    T& back() { return *(end() - 1); }
+    const T& back() const { return *(end() - 1); }
+  };
+  
+  template<typename T>
+  class list {
+    struct __item {
+      T data;
+      __item *prev, *next;
+    } *_start, *_finish;
+  public:
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __list_iterator<__item, T *, T &> iterator;
+    typedef __list_iterator<__item, const T *, const T &> const_iterator;
+
+    list() : _start(0), _finish(0) {}
+    template <typename InputIterator>
+    list(InputIterator first, InputIterator last);
+    list(const list &other);
+    list(list &&other);
+    ~list();
     
-    void push_back();
-    T pop_back();
+    list& operator=(const list &other);
+    list& operator=(list &&other);
+    list& operator=(std::initializer_list<T> ilist);
+
+    iterator begin() { return iterator(_start); }
+    const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
+    iterator end() { return iterator(_finish); }
+    const_iterator end() const { return const_iterator(_finish); }
+    const_iterator cend() const { return const_iterator(_finish); }
+
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+    T& back() { return *--end(); }
+    const T& back() const { return *--end(); }
+  };
 
+  template<typename T>
+  class deque {
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __deque_iterator<T, T *, T &> iterator;
+    typedef __deque_iterator<T, const T *, const T &> const_iterator;
+
+    T *_start;
+    T *_finish;
+    T *_end_of_storage;
+  public:
+    deque() : _start(0), _finish(0), _end_of_storage(0) {}
+    template <typename InputIterator>
+    deque(InputIterator first, InputIterator last);
+    deque(const deque &other);
+    deque(deque &&other);
+    ~deque();
+    
+    size_t size() const {
+      return size_t(_finish - _start);
+    }
+    
     T &operator[](size_t n) {
       return _start[n];
     }
@@ -77,10 +340,46 @@ namespace std {
     
     iterator begin() { return iterator(_start); }
     const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
     iterator end() { return iterator(_finish); }
     const_iterator end() const { return const_iterator(_finish); }
+    const_iterator cend() const { return const_iterator(_finish); }
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+    T& back() { return *(end() - 1); }
+    const T& back() const { return *(end() - 1); }
   };
   
+  template<typename T>
+  class forward_list {
+    struct __item {
+      T data;
+      __item *next;
+    } *_start;
+  public:
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __fwdl_iterator<__item, T *, T &> iterator;
+    typedef __fwdl_iterator<__item, const T *, const T &> const_iterator;
+
+    forward_list() : _start(0) {}
+    template <typename InputIterator>
+    forward_list(InputIterator first, InputIterator last);
+    forward_list(const forward_list &other);
+    forward_list(forward_list &&other);
+    ~forward_list();
+    
+    iterator begin() { return iterator(_start); }
+    const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
+    iterator end() { return iterator(); }
+    const_iterator end() const { return const_iterator(); }
+    const_iterator cend() const { return const_iterator(); }
+
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+  };
+
   class exception {
   public:
     exception() throw();
@@ -247,41 +546,41 @@ namespace std {
   OutputIter copy_backward(InputIter II, InputIter IE, OutputIter OI) {
     return __copy_backward(II, IE, OI);
   }
+}
+
+template <class BidirectionalIterator, class Distance>
+void __advance (BidirectionalIterator& it, Distance n,
+                std::bidirectional_iterator_tag) {
+  if (n >= 0) while(n-- > 0) ++it; else while (n++<0) --it;
+}
+
+template <class RandomAccessIterator, class Distance>
+void __advance (RandomAccessIterator& it, Distance n,
+                std::random_access_iterator_tag) {
+  it += n;
+}
+
+namespace std {
+  template <class InputIterator, class Distance>
+  void advance (InputIterator& it, Distance n) {
+    __advance(it, n, typename InputIterator::iterator_category());
+  }
+
+  template <class BidirectionalIterator>
+  BidirectionalIterator
+  prev (BidirectionalIterator it,
+        typename iterator_traits<BidirectionalIterator>::difference_type n =
+        1) {
+    advance(it, -n);
+    return it;
+  }
 
   template <class InputIterator, class T>
   InputIterator find(InputIterator first, InputIterator last, const T &val);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1,
-                            ForwardIterator2 first2, ForwardIterator2 last2);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 find_first_of(ForwardIterator1 first1,
-                                 ForwardIterator1 last1,
-                                 ForwardIterator2 first2,
-                                 ForwardIterator2 last2);
-  template <class InputIterator, class UnaryPredicate>
-  InputIterator find_if(InputIterator first, InputIterator last,
-                        UnaryPredicate pred);
-  template <class InputIterator, class UnaryPredicate>
-  InputIterator find_if_not(InputIterator first, InputIterator last,
-                            UnaryPredicate pred);
-  template <class InputIterator, class T>
-  InputIterator lower_bound(InputIterator first, InputIterator last,
-                            const T &val);
-  template <class InputIterator, class T>
-  InputIterator upper_bound(InputIterator first, InputIterator last,
-                            const T &val);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1,
-                          ForwardIterator2 first2, ForwardIterator2 last2);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 search_n(ForwardIterator1 first1, ForwardIterator1 last1,
-                            ForwardIterator2 first2, ForwardIterator2 last2);
 
-  struct input_iterator_tag { };
-  struct output_iterator_tag { };
-  struct forward_iterator_tag : public input_iterator_tag { };
-  struct bidirectional_iterator_tag : public forward_iterator_tag { };
-  struct random_access_iterator_tag : public bidirectional_iterator_tag { };
+  template <class InputIterator, class OutputIterator>
+  OutputIterator copy(InputIterator first, InputIterator last,
+                      OutputIterator result);
 
 }
 
diff --git a/test/Analysis/diagnostics/explicit-suppression.cpp b/test/Analysis/diagnostics/explicit-suppression.cpp
index 193846c082bc..96d2b4a7d095 100644
--- a/test/Analysis/diagnostics/explicit-suppression.cpp
+++ b/test/Analysis/diagnostics/explicit-suppression.cpp
@@ -19,6 +19,6 @@ class C {
 void testCopyNull(C *I, C *E) {
   std::copy(I, E, (C *)0);
 #ifndef SUPPRESSED
-  // expected-warning@../Inputs/system-header-simulator-cxx.h:191 {{Called C++ object pointer is null}}
+  // expected-warning@../Inputs/system-header-simulator-cxx.h:490 {{Called C++ object pointer is null}}
 #endif
 }
diff --git a/test/Analysis/iterator-past-end.cpp b/test/Analysis/iterator-past-end.cpp
deleted file mode 100644
index 252d1044bd16..000000000000
--- a/test/Analysis/iterator-past-end.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorPastEnd -analyzer-eagerly-assume -analyzer-config c++-container-inlining=false %s -verify
-// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorPastEnd -analyzer-eagerly-assume -analyzer-config c++-container-inlining=true -DINLINE=1 %s -verify
-
-#include "Inputs/system-header-simulator-cxx.h"
-
-void simple_good(const std::vector<int> &v) {
-  auto i = v.end();
-  if (i != v.end())
-    *i; // no-warning
-}
-
-void simple_good_negated(const std::vector<int> &v) {
-  auto i = v.end();
-  if (!(i == v.end()))
-    *i; // no-warning
-}
-
-void simple_bad(const std::vector<int> &v) {
-  auto i = v.end();
-  *i; // expected-warning{{Iterator accessed past its end}}
-}
-
-void copy(const std::vector<int> &v) {
-  auto i1 = v.end();
-  auto i2 = i1;
-  *i2; // expected-warning{{Iterator accessed past its end}}
-}
-
-void decrease(const std::vector<int> &v) {
-  auto i = v.end();
-  --i;
-  *i; // no-warning
-}
-
-void copy_and_decrease1(const std::vector<int> &v) {
-  auto i1 = v.end();
-  auto i2 = i1;
-  --i1;
-  *i1; // no-warning
-}
-
-void copy_and_decrease2(const std::vector<int> &v) {
-  auto i1 = v.end();
-  auto i2 = i1;
-  --i1;
-  *i2; // expected-warning{{Iterator accessed past its end}}
-}
-
-void copy_and_increase1(const std::vector<int> &v) {
-  auto i1 = v.begin();
-  auto i2 = i1;
-  ++i1;
-  if (i1 == v.end())
-    *i2; // no-warning
-}
-
-void copy_and_increase2(const std::vector<int> &v) {
-  auto i1 = v.begin();
-  auto i2 = i1;
-  ++i1;
-  if (i2 == v.end())
-    *i2; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find(std::vector<int> &vec, int e) {
-  auto first = std::find(vec.begin(), vec.end(), e);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find(std::vector<int> &vec, int e) {
-  auto first = std::find(vec.begin(), vec.end(), e);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find_end(std::vector<int> &vec, std::vector<int> &seq) {
-  auto last = std::find_end(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != last)
-    *last; // no-warning
-}
-
-void bad_find_end(std::vector<int> &vec, std::vector<int> &seq) {
-  auto last = std::find_end(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *last; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find_first_of(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first =
-      std::find_first_of(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find_first_of(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first = std::find_end(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-bool odd(int i) { return i % 2; }
-
-void good_find_if(std::vector<int> &vec) {
-  auto first = std::find_if(vec.begin(), vec.end(), odd);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find_if(std::vector<int> &vec, int e) {
-  auto first = std::find_if(vec.begin(), vec.end(), odd);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find_if_not(std::vector<int> &vec) {
-  auto first = std::find_if_not(vec.begin(), vec.end(), odd);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find_if_not(std::vector<int> &vec, int e) {
-  auto first = std::find_if_not(vec.begin(), vec.end(), odd);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_lower_bound(std::vector<int> &vec, int e) {
-  auto first = std::lower_bound(vec.begin(), vec.end(), e);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_lower_bound(std::vector<int> &vec, int e) {
-  auto first = std::lower_bound(vec.begin(), vec.end(), e);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_upper_bound(std::vector<int> &vec, int e) {
-  auto last = std::lower_bound(vec.begin(), vec.end(), e);
-  if (vec.end() != last)
-    *last; // no-warning
-}
-
-void bad_upper_bound(std::vector<int> &vec, int e) {
-  auto last = std::lower_bound(vec.begin(), vec.end(), e);
-  *last; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_search(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first = std::search(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_search(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first = std::search(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_search_n(std::vector<int> &vec, std::vector<int> &seq) {
-  auto nth = std::search_n(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != nth)
-    *nth; // no-warning
-}
-
-void bad_search_n(std::vector<int> &vec, std::vector<int> &seq) {
-  auto nth = std::search_n(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *nth; // expected-warning{{Iterator accessed past its end}}
-}
-
-template <class InputIterator, class T>
-InputIterator nonStdFind(InputIterator first, InputIterator last,
-                         const T &val) {
-  for (auto i = first; i != last; ++i) {
-    if (*i == val) {
-      return i;
-    }
-  }
-  return last;
-}
-
-void good_non_std_find(std::vector<int> &vec, int e) {
-  auto first = nonStdFind(vec.begin(), vec.end(), e);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_non_std_find(std::vector<int> &vec, int e) {
-  auto first = nonStdFind(vec.begin(), vec.end(), e);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void tricky(std::vector<int> &vec, int e) {
-  const auto first = vec.begin();
-  const auto comp1 = (first != vec.end()), comp2 = (first == vec.end());
-  if (comp1)
-    *first;
-}
-
-void loop(std::vector<int> &vec, int e) {
-  auto start = vec.begin();
-  while (true) {
-    auto item = std::find(start, vec.end(), e);
-    if (item == vec.end())
-      break;
-    *item;          // no-warning
-    start = ++item; // no-warning
-  }
-}
diff --git a/test/Analysis/iterator-range.cpp b/test/Analysis/iterator-range.cpp
new file mode 100644
index 000000000000..79b45188ab5d
--- /dev/null
+++ b/test/Analysis/iterator-range.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorRange -analyzer-eagerly-assume -analyzer-config c++-container-inlining=false %s -verify
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorRange -analyzer-eagerly-assume -analyzer-config c++-container-inlining=true -DINLINE=1 %s -verify
+
+#include "Inputs/system-header-simulator-cxx.h"
+
+void clang_analyzer_warnIfReached();
+
+void simple_good_end(const std::vector<int> &v) {
+  auto i = v.end();
+  if (i != v.end()) {
+    clang_analyzer_warnIfReached();
+    *i; // no-warning
+  }
+}
+
+void simple_bad_end(const std::vector<int> &v) {
+  auto i = v.end();
+  *i; // expected-warning{{Iterator accessed outside of its range}}
+}
diff --git a/test/Analysis/pthreadlock.c b/test/Analysis/pthreadlock.c
index 98868172523d..56a92d7d4d92 100644
--- a/test/Analysis/pthreadlock.c
+++ b/test/Analysis/pthreadlock.c
@@ -176,6 +176,42 @@ ok22(void) {
   pthread_mutex_unlock(pmtx);  // no-warning
 }
 
+void ok23(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_destroy(&mtx1);        // no-warning
+}
+
+void ok24(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_lock(&mtx1);           // no-warning
+}
+
+void ok25(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_unlock(&mtx1);         // no-warning
+}
+
+void ok26(void) {
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_lock(&mtx1);           // no-warning
+}
+
+void ok27(void) {
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_lock(&mtx1);           // no-warning
+  else
+    pthread_mutex_init(&mtx1, NULL); // no-warning
+}
+
+void ok28(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) { // no-warning
+    pthread_mutex_lock(&mtx1);             // no-warning
+    pthread_mutex_unlock(&mtx1);           // no-warning
+    pthread_mutex_destroy(&mtx1);          // no-warning
+  }
+}
 
 void
 bad1(void)
@@ -392,3 +428,46 @@ bad26(void)
 	pthread_mutex_unlock(&mtx1);		// no-warning
 	pthread_mutex_init(&mtx1, NULL);	// expected-warning{{This lock has already been initialized}}
 }
+
+void bad27(void) {
+  pthread_mutex_unlock(&mtx1);            // no-warning
+  int ret = pthread_mutex_destroy(&mtx1); // no-warning
+  if (ret != 0)                           // no-warning
+    pthread_mutex_lock(&mtx1);            // no-warning
+  else
+    pthread_mutex_unlock(&mtx1); // expected-warning{{This lock has already been destroyed}}
+}
+
+void bad28(void) {
+  pthread_mutex_unlock(&mtx1);            // no-warning
+  int ret = pthread_mutex_destroy(&mtx1); // no-warning
+  if (ret != 0)                           // no-warning
+    pthread_mutex_lock(&mtx1);            // no-warning
+  else
+    pthread_mutex_lock(&mtx1); // expected-warning{{This lock has already been destroyed}}
+}
+
+void bad29(void) {
+  pthread_mutex_lock(&mtx1);             // no-warning
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_init(&mtx1, NULL);     // expected-warning{{This lock has already been initialized}}
+  else
+    pthread_mutex_init(&mtx1, NULL); // no-warning
+}
+
+void bad30(void) {
+  pthread_mutex_lock(&mtx1);             // no-warning
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_init(&mtx1, NULL);     // expected-warning{{This lock has already been initialized}}
+  else
+    pthread_mutex_destroy(&mtx1); // expected-warning{{This lock has already been destroyed}}
+}
+
+void bad31(void) {
+  int ret = pthread_mutex_destroy(&mtx1); // no-warning
+  pthread_mutex_lock(&mtx1);              // expected-warning{{This lock has already been destroyed}}
+  if (ret != 0)
+    pthread_mutex_lock(&mtx1);
+}
diff --git a/test/Analysis/taint-generic.c b/test/Analysis/taint-generic.c
index 8efed66dacbf..d3ca246b82a8 100644
--- a/test/Analysis/taint-generic.c
+++ b/test/Analysis/taint-generic.c
@@ -192,20 +192,41 @@ void testStruct() {
 
 void testStructArray() {
   struct {
-    char buf[16];
-    struct {
-      int length;
-    } st[1];
-  } tainted;
+    int length;
+  } tainted[4];
 
-  char buffer[16];
+  char dstbuf[16], srcbuf[16];
   int sock;
 
   sock = socket(AF_INET, SOCK_STREAM, 0);
-  read(sock, &tainted.buf[0], sizeof(tainted.buf));
-  read(sock, &tainted.st[0], sizeof(tainted.st));
-  // FIXME: tainted.st[0].length should be marked tainted
-  __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning
+  __builtin_memset(srcbuf, 0, sizeof(srcbuf));
+
+  read(sock, &tainted[0], sizeof(tainted));
+  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+
+  __builtin_memset(&tainted, 0, sizeof(tainted));
+  read(sock, &tainted, sizeof(tainted));
+  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+
+  __builtin_memset(&tainted, 0, sizeof(tainted));
+  // If we taint element 1, we should not raise an alert on taint for element 0 or element 2
+  read(sock, &tainted[1], sizeof(tainted));
+  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // no-warning
+  __builtin_memcpy(dstbuf, srcbuf, tainted[2].length); // no-warning
+}
+
+void testUnion() {
+  union {
+    int x;
+    char y[4];
+  } tainted;
+
+  char buffer[4];
+
+  int sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, &tainted.y, sizeof(tainted.y));
+  // FIXME: overlapping regions aren't detected by isTainted yet
+  __builtin_memcpy(buffer, tainted.y, tainted.x);
 }
 
 int testDivByZero() {
diff --git a/test/CodeGen/altivec-ct.c b/test/CodeGen/altivec-ct.c
new file mode 100644
index 000000000000..1a3e14dc1fd5
--- /dev/null
+++ b/test/CodeGen/altivec-ct.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -S -O0 -o - %s -target-feature +altivec -target-feature +vsx | FileCheck %s -check-prefix=CHECK -check-prefix=VSX
+// RUN: %clang_cc1 -triple powerpc-linux-gnu -S -O0 -o - %s -target-feature +altivec -target-feature -vsx | FileCheck %s
+
+// REQUIRES: powerpc-registered-target
+
+#include <altivec.h>
+
+// CHECK-LABEL: test1
+// CHECK: vcfsx
+vector float test1(vector int x) {
+  return vec_ctf(x, 0);
+}
+
+// CHECK-LABEL: test2
+// CHECK: vcfux
+vector float test2(vector unsigned int x) {
+  return vec_ctf(x, 0);
+}
+
+#ifdef __VSX__
+// VSX-LABEL: test3
+vector double test3(vector signed long long x) {
+  return vec_ctf(x, 0);
+}
+
+// VSX-LABEL: test4
+vector double test4(vector unsigned long long x) {
+  return vec_ctf(x, 0);
+}
+#endif
+
+// CHECK-LABEL: test5
+// CHECK: vcfsx
+vector float test5(vector int x) {
+  return vec_vcfsx(x, 0);
+}
+
+// CHECK-LABEL: test6
+// CHECK: vcfux
+vector float test6(vector unsigned int x) {
+  return vec_vcfux(x, 0);
+}
+
+// CHECK-LABEL: test7
+// CHECK: vctsxs
+vector int test7(vector float x) {
+  return vec_cts(x, 0);
+}
+
+#ifdef __VSX__
+// VSX-LABEL: test8
+vector signed long long test8(vector double x) {
+  return vec_cts(x, 0);
+}
+
+#endif
+
+// CHECK-LABEL: test9
+// CHECK: vctsxs
+vector int test9(vector float x) {
+  return vec_vctsxs(x, 0);
+}
+
+// CHECK-LABEL: test10
+// CHECK: vctuxs
+vector unsigned test10(vector float x) {
+  return vec_ctu(x, 0);
+}
+
+#ifdef __VSX__
+// VSX-LABEL: test11
+vector unsigned long long test11(vector double x) {
+  return vec_ctu(x, 0);
+}
+
+#endif
+
+// CHECK-LABEL: test12
+// CHECK: vctuxs
+vector unsigned test12(vector float x) {
+  return vec_vctuxs(x, 0);
+}
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index a8b03b5d9b0b..ae7c78e08f86 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
-// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
+// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding \
+// RUN:  -disable-O0-optnone -emit-llvm -o - %s \
 // RUN:  | opt -S -mem2reg | FileCheck %s
 
 // REQUIRES: long-tests
@@ -3480,11 +3481,11 @@ float32_t test_vgetq_lane_f32(float32x4_t a) {
 }
 
 // CHECK-LABEL: @test_vgetq_lane_f16(
-// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 8
 // CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
-// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 8
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
 // CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
@@ -4542,7 +4543,7 @@ poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_u8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
@@ -4550,7 +4551,7 @@ uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_u16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4559,7 +4560,7 @@ uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_u32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
@@ -4568,7 +4569,7 @@ uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_s8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 int8x16x2_t test_vld2q_s8(int8_t const * a) {
@@ -4576,7 +4577,7 @@ int8x16x2_t test_vld2q_s8(int8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_s16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4585,7 +4586,7 @@ int16x8x2_t test_vld2q_s16(int16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_s32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
@@ -4594,7 +4595,7 @@ int32x4x2_t test_vld2q_s32(int32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_f16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4603,7 +4604,7 @@ float16x8x2_t test_vld2q_f16(float16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_f32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
@@ -4612,7 +4613,7 @@ float32x4x2_t test_vld2q_f32(float32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_p8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
@@ -4620,7 +4621,7 @@ poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_p16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4839,24 +4840,24 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -4866,24 +4867,24 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -4893,24 +4894,24 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -4920,24 +4921,24 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -4947,24 +4948,24 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -4974,24 +4975,24 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
@@ -5001,24 +5002,24 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5283,7 +5284,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_u8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
@@ -5291,7 +5292,7 @@ uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_u16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5300,7 +5301,7 @@ uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_u32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
@@ -5309,7 +5310,7 @@ uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_s8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 int8x16x3_t test_vld3q_s8(int8_t const * a) {
@@ -5317,7 +5318,7 @@ int8x16x3_t test_vld3q_s8(int8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_s16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5326,7 +5327,7 @@ int16x8x3_t test_vld3q_s16(int16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_s32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
@@ -5335,7 +5336,7 @@ int32x4x3_t test_vld3q_s32(int32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_f16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5344,7 +5345,7 @@ float16x8x3_t test_vld3q_f16(float16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_f32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
@@ -5353,7 +5354,7 @@ float32x4x3_t test_vld3q_f32(float32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_p8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
@@ -5361,7 +5362,7 @@ poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_p16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5580,28 +5581,28 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5612,28 +5613,28 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -5644,28 +5645,28 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5676,28 +5677,28 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -5708,28 +5709,28 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5740,28 +5741,28 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
@@ -5772,28 +5773,28 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6103,7 +6104,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_u8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
@@ -6111,7 +6112,7 @@ uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_u16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6120,7 +6121,7 @@ uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_u32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
@@ -6129,7 +6130,7 @@ uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_s8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 int8x16x4_t test_vld4q_s8(int8_t const * a) {
@@ -6137,7 +6138,7 @@ int8x16x4_t test_vld4q_s8(int8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_s16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6146,7 +6147,7 @@ int16x8x4_t test_vld4q_s16(int16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_s32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
@@ -6155,7 +6156,7 @@ int32x4x4_t test_vld4q_s32(int32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_f16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6164,7 +6165,7 @@ float16x8x4_t test_vld4q_f16(float16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_f32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
@@ -6173,7 +6174,7 @@ float32x4x4_t test_vld4q_f32(float32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_p8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
@@ -6181,7 +6182,7 @@ poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_p16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6400,32 +6401,32 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6437,32 +6438,32 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -6474,32 +6475,32 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6511,32 +6512,32 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -6548,32 +6549,32 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6585,32 +6586,32 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
@@ -6622,32 +6623,32 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -14548,21 +14549,21 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
 
 // CHECK-LABEL: @test_vsetq_lane_f16(
 // CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
-// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
-// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
+// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 8
+// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 8
 // CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
 // CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
-// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
+// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
 // CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
 // CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
-// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
+// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 8
 // CHECK:   ret <8 x half> [[TMP8]]
 float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
   return vsetq_lane_f16(*a, b, 3);
@@ -16193,20 +16194,20 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_u8(
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
 // CHECK:   ret void
 void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
@@ -16214,22 +16215,22 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16240,22 +16241,22 @@ void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16266,20 +16267,20 @@ void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_s8(
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
 // CHECK:   ret void
 void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
@@ -16287,22 +16288,22 @@ void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16313,22 +16314,22 @@ void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16339,22 +16340,22 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16365,22 +16366,22 @@ void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -16391,20 +16392,20 @@ void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_p8(
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
 // CHECK:   ret void
 void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
@@ -16412,22 +16413,22 @@ void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16735,22 +16736,22 @@ void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16761,22 +16762,22 @@ void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16787,22 +16788,22 @@ void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16813,22 +16814,22 @@ void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16839,22 +16840,22 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16865,22 +16866,22 @@ void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -16891,22 +16892,22 @@ void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17162,23 +17163,23 @@ void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_u8(
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
 // CHECK:   ret void
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
@@ -17186,26 +17187,26 @@ void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17217,26 +17218,26 @@ void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17248,23 +17249,23 @@ void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_s8(
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
 // CHECK:   ret void
 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
@@ -17272,26 +17273,26 @@ void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17303,26 +17304,26 @@ void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17334,26 +17335,26 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17365,26 +17366,26 @@ void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -17396,23 +17397,23 @@ void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_p8(
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
 // CHECK:   ret void
 void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
@@ -17420,26 +17421,26 @@ void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17802,26 +17803,26 @@ void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17833,26 +17834,26 @@ void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17864,26 +17865,26 @@ void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17895,26 +17896,26 @@ void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17926,26 +17927,26 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17957,26 +17958,26 @@ void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -17988,26 +17989,26 @@ void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18308,26 +18309,26 @@ void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_u8(
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 8
 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
 // CHECK:   ret void
 void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
@@ -18335,30 +18336,30 @@ void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18371,30 +18372,30 @@ void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -18407,26 +18408,26 @@ void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_s8(
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 8
 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
 // CHECK:   ret void
 void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
@@ -18434,30 +18435,30 @@ void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18470,30 +18471,30 @@ void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -18506,30 +18507,30 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18542,30 +18543,30 @@ void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -18578,26 +18579,26 @@ void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_p8(
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 8
 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
 // CHECK:   ret void
 void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
@@ -18605,30 +18606,30 @@ void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19046,30 +19047,30 @@ void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19082,30 +19083,30 @@ void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -19118,30 +19119,30 @@ void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19154,30 +19155,30 @@ void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -19190,30 +19191,30 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19226,30 +19227,30 @@ void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -19262,30 +19263,30 @@ void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -20630,7 +20631,7 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vtrnq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -20640,14 +20641,14 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
   return vtrnq_s8(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -20659,14 +20660,14 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
   return vtrnq_s16(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -20678,14 +20679,14 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
   return vtrnq_s32(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -20695,14 +20696,14 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -20714,14 +20715,14 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -20733,14 +20734,14 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
@@ -20752,14 +20753,14 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -20769,14 +20770,14 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -20788,7 +20789,7 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
@@ -21124,7 +21125,7 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vuzpq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -21134,14 +21135,14 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21153,14 +21154,14 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21172,14 +21173,14 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -21189,14 +21190,14 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21208,14 +21209,14 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
   return vuzpq_u16(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21227,14 +21228,14 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
   return vuzpq_u32(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
@@ -21246,14 +21247,14 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
   return vuzpq_f32(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -21263,14 +21264,14 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
   return vuzpq_p8(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21282,7 +21283,7 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
   return vuzpq_p16(a, b);
@@ -21454,7 +21455,7 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vzipq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -21464,14 +21465,14 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   return vzipq_s8(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21483,14 +21484,14 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   return vzipq_s16(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21502,14 +21503,14 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   return vzipq_s32(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -21519,14 +21520,14 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   return vzipq_u8(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21538,14 +21539,14 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
   return vzipq_u16(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21557,14 +21558,14 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
   return vzipq_u32(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
@@ -21576,14 +21577,14 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
   return vzipq_f32(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -21593,14 +21594,14 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
   return vzipq_p8(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21612,7 +21613,7 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
   return vzipq_p16(a, b);
diff --git a/test/CodeGen/union-align.c b/test/CodeGen/union-align.c
index 89a9456e609d..2055d93d8efc 100644
--- a/test/CodeGen/union-align.c
+++ b/test/CodeGen/union-align.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | grep load | grep "4 x float" | not grep "align 4"
-// RUN: %clang_cc1 -emit-llvm %s -o - | grep load | grep "4 x float" | grep "align 16"
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | grep load | grep "4 x float" | not grep "align 4"
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | grep load | grep "4 x float" | grep "align 16"
 // PR3432
 // rdar://6536377
 
diff --git a/test/CodeGenOpenCL/bool_cast.cl b/test/CodeGenOpenCL/bool_cast.cl
index 8c86b06577d6..ab40eccf571f 100644
--- a/test/CodeGenOpenCL/bool_cast.cl
+++ b/test/CodeGenOpenCL/bool_cast.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -o - -O0 | FileCheck %s
 
 typedef unsigned char uchar4 __attribute((ext_vector_type(4)));
 typedef unsigned int int4 __attribute((ext_vector_type(4)));
diff --git a/test/Lexer/preamble.c b/test/Lexer/preamble.c
index 5b2739abefcc..762271f2e3f1 100644
--- a/test/Lexer/preamble.c
+++ b/test/Lexer/preamble.c
@@ -9,15 +9,12 @@
 #pragma unknown
 #endif
 #ifdef WIBBLE
-#include "honk"
-#else
-int foo();
+#include "foo"
+int bar;
 #endif
 
 // This test checks for detection of the preamble of a file, which
-// includes all of the starting comments and #includes. Note that any
-// changes to the preamble part of this file must be mirrored in
-// Inputs/preamble.txt, since we diff against it.
+// includes all of the starting comments and #includes.
 
 // RUN: %clang_cc1 -print-preamble %s > %t
 // RUN: echo END. >> %t
@@ -33,4 +30,6 @@ int foo();
 // CHECK-NEXT: #endif
 // CHECK-NEXT: #pragma unknown
 // CHECK-NEXT: #endif
+// CHECK-NEXT: #ifdef WIBBLE
+// CHECK-NEXT: #include "foo"
 // CHECK-NEXT: END.
diff --git a/test/Lexer/preamble2.c b/test/Lexer/preamble2.c
new file mode 100644
index 000000000000..499a9a22a5b3
--- /dev/null
+++ b/test/Lexer/preamble2.c
@@ -0,0 +1,19 @@
+// Preamble detection test: header with an include guard.
+#ifndef HEADER_H
+#define HEADER_H
+#include "foo"
+int bar;
+#endif
+
+// This test checks for detection of the preamble of a file, which
+// includes all of the starting comments and #includes.
+
+// RUN: %clang_cc1 -print-preamble %s > %t
+// RUN: echo END. >> %t
+// RUN: FileCheck < %t %s
+
+// CHECK: // Preamble detection test: header with an include guard.
+// CHECK-NEXT: #ifndef HEADER_H
+// CHECK-NEXT: #define HEADER_H
+// CHECK-NEXT: #include "foo"
+// CHECK-NEXT: END.
diff --git a/test/Modules/Inputs/preprocess/a.h b/test/Modules/Inputs/preprocess/a.h
new file mode 100644
index 000000000000..1292e99b4b25
--- /dev/null
+++ b/test/Modules/Inputs/preprocess/a.h
@@ -0,0 +1,2 @@
+#include "c.h"
+T a();
diff --git a/test/Modules/Inputs/preprocess/b.h b/test/Modules/Inputs/preprocess/b.h
new file mode 100644
index 000000000000..e8a9f55968f5
--- /dev/null
+++ b/test/Modules/Inputs/preprocess/b.h
@@ -0,0 +1,2 @@
+#include "c.h"
+T b();
diff --git a/test/Modules/Inputs/preprocess/c.h b/test/Modules/Inputs/preprocess/c.h
new file mode 100644
index 000000000000..718f5dc18295
--- /dev/null
+++ b/test/Modules/Inputs/preprocess/c.h
@@ -0,0 +1,4 @@
+#ifndef C_H
+#define C_H
+using T = int;
+#endif
diff --git a/test/Modules/Inputs/preprocess/module.modulemap b/test/Modules/Inputs/preprocess/module.modulemap
index 943435a953d0..f700db03beac 100644
--- a/test/Modules/Inputs/preprocess/module.modulemap
+++ b/test/Modules/Inputs/preprocess/module.modulemap
@@ -1,2 +1,7 @@
 module fwd { header "fwd.h" export * }
 module file { header "file.h" header "file2.h" export * }
+module nested {
+  module a { header "a.h" }
+  module b { header "b.h" }
+  module c { header "c.h" }
+}
diff --git a/test/Modules/preprocess-nested.cpp b/test/Modules/preprocess-nested.cpp
new file mode 100644
index 000000000000..8fccf137e94f
--- /dev/null
+++ b/test/Modules/preprocess-nested.cpp
@@ -0,0 +1,59 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -I%S/Inputs/preprocess -x c++-module-map %S/Inputs/preprocess/module.modulemap -E -o %t/no-rewrite.ii
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -I%S/Inputs/preprocess -x c++-module-map %S/Inputs/preprocess/module.modulemap -E -frewrite-includes -o %t/rewrite.ii
+
+// RUN: FileCheck %s --input-file %t/no-rewrite.ii --check-prefix=CHECK --check-prefix=NO-REWRITE
+// RUN: FileCheck %s --input-file %t/rewrite.ii    --check-prefix=CHECK --check-prefix=REWRITE
+
+// Check that we can build a module from the preprocessed output.
+// FIXME: For now, the files need to exist.
+// RUN: touch %t/a.h %t/b.h %t/c.h
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -x c++-module-map-cpp-output %t/no-rewrite.ii -emit-module -o %t/no-rewrite.pcm
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -x c++-module-map-cpp-output %t/rewrite.ii -emit-module -o %t/rewrite.pcm
+
+// Check the module we built works.
+// RUN: %clang_cc1 -fmodules -fmodule-file=%t/no-rewrite.pcm %s -I%t -verify -fno-modules-error-recovery
+// RUN: %clang_cc1 -fmodules -fmodule-file=%t/rewrite.pcm %s -I%t -verify -fno-modules-error-recovery -DREWRITE
+
+// == module map
+// CHECK: # 1 "{{.*}}module.modulemap"
+// CHECK: module nested {
+// CHECK:   module a {
+// CHECK:     header "a.h"
+// CHECK:   }
+// CHECK:   module b {
+// CHECK:     header "b.h"
+// CHECK:   }
+// CHECK:   module c {
+// CHECK:     header "c.h"
+// CHECK:   }
+// CHECK: }
+
+// CHECK: #pragma clang module begin nested.a
+// CHECK: #pragma clang module begin nested.c
+// CHECK: using T = int;
+// CHECK: #pragma clang module end
+// CHECK: T a();
+// CHECK: #pragma clang module end
+
+// CHECK: #pragma clang module begin nested.b
+// CHECK: #pragma clang module import nested.c
+// CHECK-NOT: #pragma clang module begin nested.c
+// CHECK-NOT: using T = int;
+// CHECK-NOT: #pragma clang module end
+// CHECK: T b();
+// CHECK: #pragma clang module end
+
+// CHECK: #pragma clang module import nested.c
+
+#pragma clang module import nested.b
+
+int n = b();
+T c; // expected-error {{must be imported}}
+#ifdef REWRITE
+// expected-note@rewrite.ii:* {{declar}}
+#else
+// expected-note@no-rewrite.ii:* {{declar}}
+#endif
diff --git a/test/Modules/preprocess-unavailable.cpp b/test/Modules/preprocess-unavailable.cpp
new file mode 100644
index 000000000000..e568cd7b5251
--- /dev/null
+++ b/test/Modules/preprocess-unavailable.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -x c++-module-map %s -fmodule-name=a -verify
+module a {
+  module b {
+    requires cplusplus11
+  }
+}
+#pragma clang module contents
+// expected-error@3 {{module 'a.b' requires feature 'cplusplus11'}}
+#pragma clang module begin a.b // expected-note {{entering module 'a' due to this pragma}}
+int f();
+int g() { f(); }
+#pragma clang module end // expected-error {{no matching '#pragma clang module begin'}}
diff --git a/test/OpenMP/target_data_messages.c b/test/OpenMP/target_data_messages.c
index 153b4377290c..fd41e7df2a4c 100644
--- a/test/OpenMP/target_data_messages.c
+++ b/test/OpenMP/target_data_messages.c
@@ -4,7 +4,7 @@ void foo() { }
 
 int main(int argc, char **argv) {
   int a;
-  #pragma omp target data // expected-error {{expected at least one map clause for '#pragma omp target data'}}
+  #pragma omp target data // expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
   {}
   L1:
     foo();
diff --git a/test/OpenMP/target_enter_data_map_messages.c b/test/OpenMP/target_enter_data_map_messages.c
index 6f5aad1ef650..8a7de3b96eff 100644
--- a/test/OpenMP/target_enter_data_map_messages.c
+++ b/test/OpenMP/target_enter_data_map_messages.c
@@ -4,7 +4,7 @@
 int main(int argc, char **argv) {
 
   int r;
-  #pragma omp target enter data // expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+  #pragma omp target enter data // expected-error {{expected at least one 'map' clause for '#pragma omp target enter data'}}
 
   #pragma omp target enter data map(r) // expected-error {{map type must be specified for '#pragma omp target enter data'}}
   #pragma omp target enter data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target enter data'}}
diff --git a/test/OpenMP/target_enter_data_nowait_messages.cpp b/test/OpenMP/target_enter_data_nowait_messages.cpp
index e682e8c47d70..508b1b254906 100644
--- a/test/OpenMP/target_enter_data_nowait_messages.cpp
+++ b/test/OpenMP/target_enter_data_nowait_messages.cpp
@@ -6,7 +6,7 @@ int main(int argc, char **argv) {
   #pragma omp nowait target enter data map(to: i) // expected-error {{expected an OpenMP directive}}
   #pragma omp target nowait enter data map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
   #pragma omp target enter nowait data map(to: i) // expected-error {{expected an OpenMP directive}}
-  #pragma omp target enter data nowait() map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+  #pragma omp target enter data nowait() map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}} expected-error {{expected at least one 'map' clause for '#pragma omp target enter data'}}
   #pragma omp target enter data map(to: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
   #pragma omp target enter data map(to: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
   #pragma omp target enter data map(to: i) nowait device (-10u)
diff --git a/test/OpenMP/target_exit_data_map_messages.c b/test/OpenMP/target_exit_data_map_messages.c
index a9953fbbb4c6..84bd7e79fd03 100644
--- a/test/OpenMP/target_exit_data_map_messages.c
+++ b/test/OpenMP/target_exit_data_map_messages.c
@@ -4,7 +4,7 @@
 int main(int argc, char **argv) {
 
   int r;
-  #pragma omp target exit data // expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+  #pragma omp target exit data // expected-error {{expected at least one 'map' clause for '#pragma omp target exit data'}}
 
   #pragma omp target exit data map(r) // expected-error {{map type must be specified for '#pragma omp target exit data'}}
   #pragma omp target exit data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target exit data'}}
diff --git a/test/OpenMP/target_exit_data_nowait_messages.cpp b/test/OpenMP/target_exit_data_nowait_messages.cpp
index cd743d89278e..d0286031f362 100644
--- a/test/OpenMP/target_exit_data_nowait_messages.cpp
+++ b/test/OpenMP/target_exit_data_nowait_messages.cpp
@@ -6,7 +6,7 @@ int main(int argc, char **argv) {
   #pragma omp nowait target exit data map(from: i) // expected-error {{expected an OpenMP directive}}
   #pragma omp target nowait exit data map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
   #pragma omp target exit nowait data map(from: i) // expected-error {{expected an OpenMP directive}}
-  #pragma omp target exit data nowait() map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+  #pragma omp target exit data nowait() map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}} expected-error {{expected at least one 'map' clause for '#pragma omp target exit data'}}
   #pragma omp target exit data map(from: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
   #pragma omp target exit data map(from: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
   #pragma omp target exit data map(from: i) nowait device (-10u)
diff --git a/test/OpenMP/target_map_messages.cpp b/test/OpenMP/target_map_messages.cpp
index 9d166ae9a1d9..b9640b965d50 100644
--- a/test/OpenMP/target_map_messages.cpp
+++ b/test/OpenMP/target_map_messages.cpp
@@ -467,7 +467,7 @@ int main(int argc, char **argv) {
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one map clause for '#pragma omp target data'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
 #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
 #pragma omp target data map() // expected-error {{expected expression}}
 #pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
diff --git a/test/OpenMP/target_teams_map_messages.cpp b/test/OpenMP/target_teams_map_messages.cpp
index 15839b848532..89425d622c10 100644
--- a/test/OpenMP/target_teams_map_messages.cpp
+++ b/test/OpenMP/target_teams_map_messages.cpp
@@ -464,7 +464,7 @@ int main(int argc, char **argv) {
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one map clause for '#pragma omp target data'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
 #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
 #pragma omp target data map() // expected-error {{expected expression}}
 #pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
diff --git a/test/SemaOpenCL/arithmetic-conversions.cl b/test/SemaOpenCL/arithmetic-conversions.cl
new file mode 100644
index 000000000000..23103caf85ef
--- /dev/null
+++ b/test/SemaOpenCL/arithmetic-conversions.cl
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only -cl-std=CL1.2
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+
+kernel void foo1(float2 in, global float2 *out) { *out = in + 0.5;} // expected-error {{scalar operand type has greater rank than the type of the vector element. ('float2' (vector of 2 'float' values) and 'double')}}
+
+kernel void foo2(float2 in, global float2 *out) { *out = 0.5 + in;} // expected-error {{scalar operand type has greater rank than the type of the vector element. ('double' and 'float2' (vector of 2 'float' values))}}
+
+kernel void foo3(float2 in, global float2 *out) { *out = 0.5f + in;}
+
+kernel void foo4(long2 in, global long2 *out) { *out = 5 + in;}
+
+kernel void foo5(float2 in, global float2 *out) {
+    float* f;
+    *out = f + in; // expected-error{{cannot convert between vector and non-scalar values ('float *' and 'float2' (vector of 2 'float' values))}}
+}
+
+kernel void foo6(int2 in, global int2 *out) {
+    int* f;
+    *out = f + in; // expected-error{{cannot convert between vector and non-scalar values ('int *' and 'int2' (vector of 2 'int' values))}}
+}
diff --git a/test/SemaOpenCL/clang-builtin-version.cl b/test/SemaOpenCL/clang-builtin-version.cl
index 8574682ab93b..3e270e64c69d 100644
--- a/test/SemaOpenCL/clang-builtin-version.cl
+++ b/test/SemaOpenCL/clang-builtin-version.cl
@@ -4,13 +4,13 @@
 
 kernel void dse_builtins() {
   int tmp;
-  enqueue_kernel(tmp, tmp, tmp, ^(void) { // expected-warning{{implicit declaration of function 'enqueue_kernel' is invalid in C99}}
+  enqueue_kernel(tmp, tmp, tmp, ^(void) { // expected-error{{implicit declaration of function 'enqueue_kernel' is invalid in OpenCL}}
     return;
   });
-  unsigned size = get_kernel_work_group_size(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_work_group_size' is invalid in C99}}
+  unsigned size = get_kernel_work_group_size(^(void) { // expected-error{{implicit declaration of function 'get_kernel_work_group_size' is invalid in OpenCL}}
     return;
   });
-  size = get_kernel_preferred_work_group_size_multiple(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_preferred_work_group_size_multiple' is invalid in C99}}
+  size = get_kernel_preferred_work_group_size_multiple(^(void) { // expected-error{{implicit declaration of function 'get_kernel_preferred_work_group_size_multiple' is invalid in OpenCL}}
     return;
   });
 }
@@ -18,27 +18,48 @@ kernel void dse_builtins() {
 void pipe_builtins() {
   int tmp;
 
-  read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'read_pipe' is invalid in C99}}
-  write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'write_pipe' is invalid in C99}}
+  foo(void); // expected-error{{implicit declaration of function 'foo' is invalid in OpenCL}}
+  // expected-note@-1{{'foo' declared here}}
+  // expected-error@-2{{expected expression}}
+  boo(); // expected-error{{implicit declaration of function 'boo' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'foo'?}}
 
-  reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'reserve_read_pipe' is invalid in C99}}
-  reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'reserve_write_pipe' is invalid in C99}}
+  read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'read_pipe' is invalid in OpenCL}}
+  write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'write_pipe' is invalid in OpenCL}}
 
-  work_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_reserve_read_pipe' is invalid in C99}}
-  work_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_reserve_write_pipe' is invalid in C99}}
+  reserve_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'reserve_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'reserve_read_pipe' declared here}}
+  reserve_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'reserve_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'reserve_read_pipe'?}}
 
-  sub_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_reserve_write_pipe' is invalid in C99}}
-  sub_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_reserve_read_pipe' is invalid in C99}}
+  work_group_reserve_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'work_group_reserve_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1 2{{'work_group_reserve_read_pipe' declared here}}
+  work_group_reserve_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'work_group_reserve_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'work_group_reserve_read_pipe'?}}
+  // expected-note@-2{{'work_group_reserve_write_pipe' declared here}}
 
-  commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'commit_read_pipe' is invalid in C99}}
-  commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'commit_write_pipe' is invalid in C99}}
+  sub_group_reserve_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'sub_group_reserve_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'work_group_reserve_write_pipe'?}}
+  sub_group_reserve_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'sub_group_reserve_read_pipe' is invalid in OpenCL}}
 
-  work_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_commit_read_pipe' is invalid in C99}}
-  work_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_commit_write_pipe' is invalid in C99}}
+  commit_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'commit_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'commit_read_pipe' declared here}}
+  commit_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'commit_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'commit_read_pipe'?}}
 
-  sub_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_commit_write_pipe' is invalid in C99}}
-  sub_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_commit_read_pipe' is invalid in C99}}
+  work_group_commit_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'work_group_commit_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'work_group_commit_read_pipe' declared here}}
+  // expected-note@-2{{did you mean 'work_group_reserve_read_pipe'?}}
+  work_group_commit_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'work_group_commit_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'work_group_commit_write_pipe' declared here}}
+  // expected-note@-2{{did you mean 'work_group_commit_read_pipe'?}}
 
-  get_pipe_num_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_num_packets' is invalid in C99}}
-  get_pipe_max_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_max_packets' is invalid in C99}}
+  sub_group_commit_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'sub_group_commit_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'work_group_commit_write_pipe'?}}
+  sub_group_commit_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'sub_group_commit_read_pipe' is invalid in OpenCL}}
+
+  get_pipe_num_packets(tmp); // expected-error{{implicit declaration of function 'get_pipe_num_packets' is invalid in OpenCL}}
+  // expected-note@-1{{'get_pipe_num_packets' declared here}}
+  get_pipe_max_packets(tmp); // expected-error{{implicit declaration of function 'get_pipe_max_packets' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'get_pipe_num_packets'?}}
 }
diff --git a/test/SemaOpenCL/cond.cl b/test/SemaOpenCL/cond.cl
index 60f70564d861..851947cd390f 100644
--- a/test/SemaOpenCL/cond.cl
+++ b/test/SemaOpenCL/cond.cl
@@ -89,7 +89,7 @@ float2 ntest04(int2 C, int2 X, float2 Y)
 
 float2 ntest05(int2 C, int2 X, float Y)
 {
-  return C ? X : Y; // expected-error {{cannot convert between vector values of different size ('int2' (vector of 2 'int' values) and 'float')}}
+  return C ? X : Y; // expected-error {{scalar operand type has greater rank than the type of the vector element. ('int2' (vector of 2 'int' values) and 'float'}}
 }
 
 char2 ntest06(int2 C, char2 X, char2 Y)
diff --git a/test/SemaOpenCL/to_addr_builtin.cl b/test/SemaOpenCL/to_addr_builtin.cl
index a145626cfcee..56db4abed579 100644
--- a/test/SemaOpenCL/to_addr_builtin.cl
+++ b/test/SemaOpenCL/to_addr_builtin.cl
@@ -10,7 +10,7 @@ void test(void) {
 
   glob = to_global(glob, loc);
 #if __OPENCL_C_VERSION__ < CL_VERSION_2_0
-  // expected-warning@-2{{implicit declaration of function 'to_global' is invalid in C99}}
+  // expected-error@-2{{implicit declaration of function 'to_global' is invalid in OpenCL}}
   // expected-warning@-3{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
 #else
   // expected-error@-5{{invalid number of arguments to function: 'to_global'}}
diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c
index 1f5d60443197..31ad828a2f10 100644
--- a/tools/c-index-test/c-index-test.c
+++ b/tools/c-index-test/c-index-test.c
@@ -1742,6 +1742,8 @@ int perform_test_load_source(int argc, const char **argv,
       return -1;
 
     if (Repeats > 1) {
+      clang_suspendTranslationUnit(TU);
+
       Err = clang_reparseTranslationUnit(TU, num_unsaved_files, unsaved_files,
                                          clang_defaultReparseOptions(TU));
       if (Err != CXError_Success) {
diff --git a/tools/libclang/CIndex.cpp b/tools/libclang/CIndex.cpp
index 394ee0ec4390..2d92de19d99c 100644
--- a/tools/libclang/CIndex.cpp
+++ b/tools/libclang/CIndex.cpp
@@ -3918,6 +3918,20 @@ void clang_disposeTranslationUnit(CXTranslationUnit CTUnit) {
   }
 }
 
+unsigned clang_suspendTranslationUnit(CXTranslationUnit CTUnit) {
+  if (CTUnit) {
+    ASTUnit *Unit = cxtu::getASTUnit(CTUnit);
+
+    if (Unit && Unit->isUnsafeToFree())
+      return false;
+
+    Unit->ResetForParse();
+    return true;
+  }
+
+  return false;
+}
+
 unsigned clang_defaultReparseOptions(CXTranslationUnit TU) {
   return CXReparse_None;
 }
diff --git a/tools/libclang/libclang.exports b/tools/libclang/libclang.exports
index d9a406e5741b..f3758469cb60 100644
--- a/tools/libclang/libclang.exports
+++ b/tools/libclang/libclang.exports
@@ -305,6 +305,7 @@ clang_remap_getFilenames
 clang_remap_getNumFiles
 clang_reparseTranslationUnit
 clang_saveTranslationUnit
+clang_suspendTranslationUnit
 clang_sortCodeCompletionResults
 clang_toggleCrashRecovery
 clang_tokenize